From bd414f3ff1648ade2ad4bb81eeb180be7b0e348f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 12 Jan 2022 13:38:52 +0100
Subject: [PATCH 001/504] Span/SpanGroup: wrap SpanC in shared_ptr (#9869)

* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/_parser_internals/ner.pyx |  32 +++---
 spacy/tests/doc/test_span.py             |  23 ++--
 spacy/tokens/span.pxd                    |  11 +-
 spacy/tokens/span.pyx                    | 131 ++++++++++++++---------
 spacy/tokens/span_group.pxd              |   5 +-
 spacy/tokens/span_group.pyx              |  22 ++--
 6 files changed, 132 insertions(+), 92 deletions(-)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index e4312bd2f92..c77b7b50f2d 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,4 +1,8 @@
-# cython: profile=False
+import os
+import random
+from libc.stdint cimport int32_t
+from libcpp.memory cimport shared_ptr
+from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 
@@ -47,9 +51,7 @@ MOVE_NAMES[OUT] = 'O'
 
 cdef struct GoldNERStateC:
     Transition* ner
-    SpanC* negs
-    int32_t length
-    int32_t nr_neg
+    vector[shared_ptr[SpanC]] negs
 
 
 cdef class BiluoGold:
@@ -82,8 +84,6 @@ cdef GoldNERStateC create_gold_state(
         negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
-    gs.nr_neg = len(negs)
     ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
@@ -97,8 +97,8 @@ cdef GoldNERStateC create_gold_state(
     # In order to handle negative samples, we need to maintain the full
     # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
     # thing, we'll get blocked if there's an incorrect prefix.
-    for i, neg in enumerate(negs):
-        gs.negs[i] = neg.c
+    for neg in negs:
+        gs.negs.push_back(neg.c)
     return gs
 
 
@@ -413,6 +413,8 @@ cdef class Begin:
         cdef int g_act = gold.ner[b0].move
         cdef attr_t g_tag = gold.ner[b0].label
 
+        cdef shared_ptr[SpanC] span
+
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
@@ -430,8 +432,8 @@ cdef class Begin:
             # be correct or not. However, we can at least tell whether we're
             # going to be opening an entity where there's only one possible
             # L.
-            for span in gold.negs[:gold.nr_neg]:
-                if span.label == label and span.start == b0:
+            for span in gold.negs:
+                if span.get().label == label and span.get().start == b0:
                     cost += 1
                     break
         return cost
@@ -572,8 +574,9 @@ cdef class Last:
         # If we have negative-example entities, integrate them into the objective,
         # by marking actions that close an entity that we know is incorrect
         # as costly.
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
                 cost += 1
                 break
         return cost
@@ -637,8 +640,9 @@ cdef class Unit:
         # This is fairly straight-forward for U- entities, as we have a single
         # action
         cdef int b0 = s.B(0)
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and span.start == b0 and span.end == (b0+1):
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
                 cost += 1
                 break
         return cost
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 98a74bc2145..e5c71dafcf7 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -5,7 +5,8 @@
 
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
-from spacy.tokens import Doc, Span, Token
+from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
@@ -163,16 +164,16 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
-def test_char_span_attributes(doc):
-    label = "LABEL"
-    kb_id = "KB_ID"
-    span_id = "SPAN_ID"
-    span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
-    span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
-    assert span1.text == span2.text
-    assert span1.label_ == span2.label_ == label
-    assert span1.kb_id_ == span2.kb_id_ == kb_id
-    assert span1.id_ == span2.id_ == span_id
+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
 
 
 def test_spans_sent_spans(doc):
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index d77bbea7035..ce318ed0dfb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 
 from ..structs cimport SpanC
@@ -7,19 +8,21 @@ from .doc cimport Doc
 
 cdef class Span:
     cdef readonly Doc doc
-    cdef SpanC c
+    cdef shared_ptr[SpanC] c
     cdef public _vector
     cdef public _vector_norm
 
     @staticmethod
-    cdef inline Span cinit(Doc doc, SpanC span):
+    cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span):
         cdef Span self = Span.__new__(
             Span,
             doc,
-            start=span.start,
-            end=span.end
+            start=span.get().start,
+            end=span.get().end
         )
         self.c = span
         return self
 
     cpdef np.ndarray to_array(self, object features)
+
+    cdef SpanC* span_c(self)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e179bbce7eb..17c4c4c6059 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,8 +1,7 @@
 # cython: profile=False
 cimport numpy as np
-
-import copy
-import warnings
+from libc.math cimport sqrt
+from libcpp.memory cimport make_shared
 
 import numpy
 from thinc.api import get_array_module
@@ -115,7 +114,7 @@ cdef class Span:
             end_char = start_char
         else:
             end_char = doc[end - 1].idx + len(doc[end - 1])
-        self.c = SpanC(
+        self.c = make_shared[SpanC](SpanC(
             label=label,
             kb_id=kb_id,
             id=span_id,
@@ -123,7 +122,7 @@ cdef class Span:
             end=end,
             start_char=start_char,
             end_char=end_char,
-        )
+        ))
         self._vector = vector
         self._vector_norm = vector_norm
 
@@ -133,32 +132,46 @@ cdef class Span:
                 return False
             else:
                 return True
-        if not isinstance(other, Span):
-            return False
-        cdef Span other_span = other
-        self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
-        other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc)
+
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* other_span_c = other.span_c()
+
         # <
         if op == 0:
-            return self_tuple < other_tuple
+            return span_c.start_char < other_span_c.start_char
         # <=
         elif op == 1:
-            return self_tuple <= other_tuple
+            return span_c.start_char <= other_span_c.start_char
         # ==
         elif op == 2:
-            return self_tuple == other_tuple
+            # Do the cheap comparisons first
+            return (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # !=
         elif op == 3:
-            return self_tuple != other_tuple
+            # Do the cheap comparisons first
+            return not (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # >
         elif op == 4:
-            return self_tuple > other_tuple
+            return span_c.start_char > other_span_c.start_char
         # >=
         elif op == 5:
-            return self_tuple >= other_tuple
+            return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
-        return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.c.id))
+        cdef SpanC* span_c = self.span_c()
+        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
 
     def __len__(self):
         """Get the number of tokens in the span.
@@ -167,9 +180,10 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#len
         """
-        if self.c.end < self.c.start:
+        cdef SpanC* span_c = self.span_c()
+        if span_c.end < span_c.start:
             return 0
-        return self.c.end - self.c.start
+        return span_c.end - span_c.start
 
     def __repr__(self):
         return self.text
@@ -183,15 +197,16 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#getitem
         """
+        cdef SpanC* span_c = self.span_c()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
         else:
             if i < 0:
-                token_i = self.c.end + i
+                token_i = span_c.end + i
             else:
-                token_i = self.c.start + i
-            if self.c.start <= token_i < self.c.end:
+                token_i = span_c.start + i
+            if span_c.start <= token_i < span_c.end:
                 return self.doc[token_i]
             else:
                 raise IndexError(Errors.E1002)
@@ -203,7 +218,8 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#iter
         """
-        for i in range(self.c.start, self.c.end):
+        cdef SpanC* span_c = self.span_c()
+        for i in range(span_c.start, span_c.end):
             yield self.doc[i]
 
     def __reduce__(self):
@@ -211,9 +227,10 @@ cdef class Span:
 
     @property
     def _(self):
+        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=self.c.start_char, end=self.c.end_char)
+                          start=span_c.start_char, end=span_c.end_char)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -287,13 +304,14 @@ cdef class Span:
         cdef int length = len(array)
         cdef attr_t value
         cdef int i, head_col, ancestor_i
+        cdef SpanC* span_c = self.span_c()
         old_to_new_root = dict()
         if HEAD in attrs:
             head_col = attrs.index(HEAD)
             for i in range(length):
                 # if the HEAD refers to a token outside this span, find a more appropriate ancestor
                 token = self[i]
-                ancestor_i = token.head.i - self.c.start   # span offset
+                ancestor_i = token.head.i - span_c.start   # span offset
                 if ancestor_i not in range(length):
                     if DEP in attrs:
                         array[i, attrs.index(DEP)] = dep
@@ -301,7 +319,7 @@ cdef class Span:
                     # try finding an ancestor within this span
                     ancestors = token.ancestors
                     for ancestor in ancestors:
-                        ancestor_i = ancestor.i - self.c.start
+                        ancestor_i = ancestor.i - span_c.start
                         if ancestor_i in range(length):
                             array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
 
@@ -330,7 +348,8 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#get_lca_matrix
         """
-        return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end))
+        cdef SpanC* span_c = self.span_c()
+        return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -440,6 +459,9 @@ cdef class Span:
         else:
             raise ValueError(Errors.E030)
 
+    cdef SpanC* span_c(self):
+        return self.c.get()
+
     @property
     def sents(self):
         """Obtain the sentences that contain this span. If the given span
@@ -494,10 +516,13 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#ents
         """
         cdef Span ent
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* ent_span_c
         ents = []
         for ent in self.doc.ents:
-            if ent.c.start >= self.c.start:
-                if ent.c.end <= self.c.end:
+            ent_span_c = ent.span_c()
+            if ent_span_c.start >= span_c.start:
+                if ent_span_c.end <= span_c.end:
                     ents.append(ent)
                 else:
                     break
@@ -631,11 +656,12 @@ cdef class Span:
         # This should probably be called 'head', and the other one called
         # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
+        cdef SpanC* span_c = self.span_c()
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The
         # longer the span, the more likely it contains a sentence root, and
         # in this case we return in linear time.
-        for i in range(self.c.start, self.c.end):
+        for i in range(span_c.start, span_c.end):
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
@@ -646,15 +672,15 @@ cdef class Span:
         # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
-        for i in range(self.c.start, self.c.end):
-            if self.c.start <= (i+self.doc.c[i].head) < self.c.end:
+        for i in range(span_c.start, span_c.end):
+            if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
                 continue
             words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
             if words_to_root < current_best:
                 current_best = words_to_root
                 root = i
         if root == -1:
-            return self.doc[self.c.start]
+            return self.doc[span_c.start]
         else:
             return self.doc[root]
 
@@ -677,9 +703,10 @@ cdef class Span:
         span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
         """
-        start_idx += self.c.start_char
-        end_idx += self.c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
+        cdef SpanC* span_c = self.span_c()
+        start_idx += span_c.start_char
+        end_idx += span_c.start_char
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
 
     @property
     def conjuncts(self):
@@ -759,53 +786,53 @@ cdef class Span:
 
     property start:
         def __get__(self):
-            return self.c.start
+            return self.span_c().start
 
         def __set__(self, int start):
             if start < 0:
-                raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-            self.c.start = start
+                raise IndexError("TODO")
+            self.span_c().start = start
 
     property end:
         def __get__(self):
-            return self.c.end
+            return self.span_c().end
 
         def __set__(self, int end):
             if end < 0:
-                raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-            self.c.end = end
+                raise IndexError("TODO")
+            self.span_c().end = end
 
     property start_char:
         def __get__(self):
-            return self.c.start_char
+            return self.span_c().start_char
 
         def __set__(self, int start_char):
             if start_char < 0:
-                raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-            self.c.start_char = start_char
+                raise IndexError("TODO")
+            self.span_c().start_char = start_char
 
     property end_char:
         def __get__(self):
-            return self.c.end_char
+            return self.span_c().end_char
 
         def __set__(self, int end_char):
             if end_char < 0:
-                raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-            self.c.end_char = end_char
+                raise IndexError("TODO")
+            self.span_c().end_char = end_char
 
     property label:
         def __get__(self):
-            return self.c.label
+            return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.c.label = label
+            self.span_c().label = label
 
     property kb_id:
         def __get__(self):
-            return self.c.kb_id
+            return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.c.kb_id = kb_id
+            self.span_c().kb_id = kb_id
 
     property id:
         def __get__(self):
diff --git a/spacy/tokens/span_group.pxd b/spacy/tokens/span_group.pxd
index 7f4145682eb..6f0ffd0eb36 100644
--- a/spacy/tokens/span_group.pxd
+++ b/spacy/tokens/span_group.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
 from ..structs cimport SpanC
@@ -7,6 +8,6 @@ cdef class SpanGroup:
     cdef public object _doc_ref
     cdef public str name
     cdef public dict attrs
-    cdef vector[SpanC] c
+    cdef vector[shared_ptr[SpanC]] c
 
-    cdef void push_back(self, SpanC span) nogil
+    cdef void push_back(self, const shared_ptr[SpanC] &span)
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 257c907bcce..8a524926a03 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -9,6 +9,8 @@ import srsly
 from spacy.errors import Errors
 
 from .span cimport Span
+from libc.stdint cimport uint64_t, uint32_t, int32_t
+from libcpp.memory cimport make_shared
 
 
 cdef class SpanGroup:
@@ -202,10 +204,12 @@ cdef class SpanGroup:
 
         DOCS: https://spacy.io/api/spangroup#to_bytes
         """
+        cdef SpanC* span_c
         output = {"name": self.name, "attrs": self.attrs, "spans": []}
         cdef int i
         for i in range(self.c.size()):
             span = self.c[i]
+            span_c = span.get()
             # The struct.pack here is probably overkill, but it might help if
             # you're saving tonnes of spans, and it doesn't really add any
             # complexity. We do take care to specify little-endian byte order
@@ -217,13 +221,13 @@ cdef class SpanGroup:
             # l: int32_t
             output["spans"].append(struct.pack(
                 ">QQQllll",
-                span.id,
-                span.kb_id,
-                span.label,
-                span.start,
-                span.end,
-                span.start_char,
-                span.end_char
+                span_c.id,
+                span_c.kb_id,
+                span_c.label,
+                span_c.start,
+                span_c.end,
+                span_c.start_char,
+                span_c.end_char
             ))
         return srsly.msgpack_dumps(output)
 
@@ -250,10 +254,10 @@ cdef class SpanGroup:
             span.end = items[4]
             span.start_char = items[5]
             span.end_char = items[6]
-            self.c.push_back(span)
+            self.c.push_back(make_shared[SpanC](span))
         return self
 
-    cdef void push_back(self, SpanC span) nogil:
+    cdef void push_back(self, const shared_ptr[SpanC] &span):
         self.c.push_back(span)
 
     def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:

From 835a20e962a2c1150d517f636eea0253ee5e45b5 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 15 Apr 2022 15:34:58 +0200
Subject: [PATCH 002/504] Return doc offsets in Matcher on spans (#10576)

The returned match offsets were only adjusted for `as_spans`, not
generally. Because the `on_match` callbacks are always applied to the
doc, the `Matcher` matches on spans should consistently use the doc
offsets.
---
 spacy/matcher/matcher.pyx               |  7 ++++---
 spacy/tests/matcher/test_matcher_api.py | 13 ++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 9a9ed421223..f0116169a6b 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -275,6 +275,10 @@ cdef class Matcher:
         # non-overlapping ones this `match` can be either (start, end) or
         # (start, end, alignments) depending on `with_alignments=` option.
         for key, *match in matches:
+            # Adjust span matches to doc offsets
+            if isinstance(doclike, Span):
+                match[0] += doclike.start
+                match[1] += doclike.start
             span_filter = self._filter.get(key)
             if span_filter is not None:
                 pairs = pairs_by_id.get(key, [])
@@ -305,9 +309,6 @@ cdef class Matcher:
         if as_spans:
             final_results = []
             for key, start, end, *_ in final_matches:
-                if isinstance(doclike, Span):
-                    start += doclike.start
-                    end += doclike.start
                 final_results.append(Span(doc, start, end, label=key))
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index c824ca39253..106a00b3011 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -794,9 +794,16 @@ def test_matcher_span(matcher):
     doc = Doc(matcher.vocab, words=text.split())
     span_js = doc[:3]
     span_java = doc[4:]
-    assert len(matcher(doc)) == 2
-    assert len(matcher(span_js)) == 1
-    assert len(matcher(span_java)) == 1
+    doc_matches = matcher(doc)
+    span_js_matches = matcher(span_js)
+    span_java_matches = matcher(span_java)
+    assert len(doc_matches) == 2
+    assert len(span_js_matches) == 1
+    assert len(span_java_matches) == 1
+
+    # match offsets always refer to the doc
+    assert doc_matches[0] == span_js_matches[0]
+    assert doc_matches[1] == span_java_matches[0]
 
 
 def test_matcher_as_spans(matcher):

From ad4ba09a3ce19e76d4c1b10f27b287a27ef67ad1 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 15 Jul 2022 11:14:08 +0200
Subject: [PATCH 003/504] `Morphology`/`Morphologizer` optimizations and
 refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologzier`: Avoid unnecessary sorting of morpho features

* `Morphologizer`: Remove execessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
---
 spacy/morphology.pxd               |  47 +++--
 spacy/morphology.pyx               | 274 +++++++++++++++++------------
 spacy/pipeline/morphologizer.pyx   |  30 ++--
 spacy/structs.pxd                  |   8 -
 spacy/tokens/morphanalysis.pxd     |   9 +-
 spacy/tokens/morphanalysis.pyx     |  40 +++--
 spacy/tokens/token.pyx             |   3 +-
 website/docs/api/morphologizer.mdx |   2 +-
 8 files changed, 240 insertions(+), 173 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index ee43aa4ec81..494088879b1 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,27 +1,42 @@
 cimport numpy as np
-from cymem.cymem cimport Pool
-from libc.stdint cimport uint64_t
-from preshed.maps cimport PreshMap
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.unordered_map cimport unordered_map
+from libcpp.vector cimport vector
+from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
 from .typedefs cimport attr_t, hash_t
 
 
+cdef cppclass Feature:
+    hash_t field
+    hash_t value
+
+    __init__():
+        this.field = 0
+        this.value = 0
+
+
+cdef cppclass MorphAnalysisC:
+    hash_t key  
+    vector[Feature] features
+
+    __init__():
+        this.key = 0
+
 cdef class Morphology:
-    cdef readonly Pool mem
     cdef readonly StringStore strings
-    cdef PreshMap tags  # Keyed by hash, value is pointer to tag
-
-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
-    cdef int insert(self, MorphAnalysisC tag) except -1
+    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
 
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
+    cdef hash_t _add(self, features)
+    cdef str _normalize_features(self, features)
+    cdef str get_morph_str(self, hash_t morph_key)
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
 
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
-cdef list list_features(const MorphAnalysisC* morph)
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
-cdef int get_n_by_field(
-    attr_t* results,
-    const MorphAnalysisC* morph,
-    attr_t field,
-) nogil
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index cef45b04d14..7ee621056f1 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,11 +1,11 @@
 # cython: infer_types
 # cython: profile=False
 import warnings
+from typing import Union, Tuple, List, Dict, Optional
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr
 
-import numpy
-
-from .attrs cimport POS
-
+from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 from .parts_of_speech import IDS as POS_IDS
@@ -26,135 +26,187 @@ cdef class Morphology:
     EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
-        self.mem = Pool()
         self.strings = strings
-        self.tags = PreshMap()
 
     def __reduce__(self):
         tags = set([self.get(self.strings[s]) for s in self.strings])
         tags -= set([""])
         return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
 
-    def add(self, features):
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
+        match = self.tags.find(tag_hash)
+        if match != self.tags.const_end():
+            return deref(match).second
+        else:
+            return shared_ptr[MorphAnalysisC]()
+
+    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
+        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
+            attr_key = self.strings.as_string(attr_key)
+            attr_value = self.strings.as_string(attr_value)
+
+            # Preserve multiple values as a list
+            if self.VALUE_SEP in attr_value:
+                values = attr_value.split(self.VALUE_SEP)
+                values.sort()
+                attr_value = values
+        else:
+            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
+            return None
+
+        return attr_key, attr_value
+
+    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
+        if not feats or feats == self.EMPTY_MORPH:
+            return {}
+
+        out = []
+        for feat in feats.split(self.FEATURE_SEP):
+            field, values = feat.split(self.FIELD_SEP, 1)
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
+        out = []
+        for field, values in feats.items():
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+
+    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
+        norm_feats_string = self.FEATURE_SEP.join([
+                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            for field, values in feats.items()
+        ])
+        return norm_feats_string or self.EMPTY_MORPH
+
+
+    cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
         FEATS format as a string or in the tag map dict format.
         Returns the hash of the new analysis.
         """
-        cdef MorphAnalysisC* tag_ptr
+        cdef hash_t tag_hash = 0
+        cdef shared_ptr[MorphAnalysisC] tag
         if isinstance(features, str):
             if features == "":
                 features = self.EMPTY_MORPH
-            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
-            if tag_ptr != NULL:
-                return tag_ptr.key
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+
+            tag_hash = self.strings[features]
+            tag = self._lookup_tag(tag_hash)
+            if tag:
+                return deref(tag).key
+
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # intified ("Field", "Field=Value") pairs
-        field_feature_pairs = []
-        for field in sorted(string_features):
-            values = string_features[field]
-            for value in values.split(self.VALUE_SEP):
-                field_feature_pairs.append((
-                    self.strings.add(field),
-                    self.strings.add(field + self.FIELD_SEP + value),
-                ))
-        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
+
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
-        norm_feats_string = self.normalize_features(features)
-        tag.key = self.strings.add(norm_feats_string)
-        self.insert(tag)
-        return tag.key
+        norm_feats_string = self._normalized_feat_dict_to_str(features)
+        tag_hash = self.strings.add(norm_feats_string)
+        tag = self._lookup_tag(tag_hash)
+        if tag:
+            return deref(tag).key
+
+        self._intern_morph_tag(tag_hash, features)
+        return tag_hash
+
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
+        # intified ("Field", "Field=Value") pairs where fields with multiple values have
+        # been split into individual tuples, e.g.:
+        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
+        # ("Field2", "Field2=Value3")]
+        field_feature_pairs = []
 
-    def normalize_features(self, features):
+        # Feat dict is normalized at this point.
+        for field, values in feats.items():
+            field_key = self.strings.add(field)
+            if isinstance(values, list):
+                for value in values:
+                    value_key = self.strings.add(field + self.FIELD_SEP + value)
+                    field_feature_pairs.append((field_key, value_key))
+            else:
+                # We could box scalar values into a list and use a common
+                # code path to generate features but that incurs a small 
+                # but measurable allocation/iteration overhead (as this
+                # branch is taken often enough).
+                value_key = self.strings.add(field + self.FIELD_SEP + values)
+                field_feature_pairs.append((field_key, value_key))
+
+        num_features = len(field_feature_pairs)
+        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
+        deref(tag).key = tag_key
+        deref(tag).features.resize(num_features)
+
+        for i in range(num_features):
+            deref(tag).features[i].field = field_feature_pairs[i][0]
+            deref(tag).features[i].value = field_feature_pairs[i][1]
+
+        self.tags[tag_key] = tag
+
+    cdef str get_morph_str(self, hash_t morph_key):
+        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
+        if not tag:
+            return ""
+        else:
+            return self.strings[deref(tag).key]
+
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
+        return self._lookup_tag(morph_key)
+
+    cdef str _normalize_features(self, features):
         """Create a normalized FEATS string from a features string or dict.
 
         features (Union[dict, str]): Features as dict or UFEATS string.
         RETURNS (str): Features as normalized UFEATS string.
         """
         if isinstance(features, str):
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = self.normalize_attrs(features)
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(
-            sorted(
-                [self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
-            )
-        )
-        return norm_feats_string or self.EMPTY_MORPH
 
-    def normalize_attrs(self, attrs):
-        """Convert attrs dict so that POS is always by ID, other features are
-        by string. Values separated by VALUE_SEP are sorted.
-        """
-        out = {}
-        attrs = dict(attrs)
-        for key, value in attrs.items():
-            # convert POS value to ID
-            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
-                if isinstance(value, str) and value.upper() in POS_IDS:
-                    value = POS_IDS[value.upper()]
-                elif isinstance(value, int) and value not in POS_IDS.values():
-                    warnings.warn(Warnings.W100.format(feature={key: value}))
-                    continue
-                out[POS] = value
-            # accept any string or ID fields and values and convert to strings
-            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
-                key = self.strings.as_string(key)
-                value = self.strings.as_string(value)
-                # sort values
-                if self.VALUE_SEP in value:
-                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
-                out[key] = value
-            else:
-                warnings.warn(Warnings.W100.format(feature={key: value}))
-        return out
+        return self._normalized_feat_dict_to_str(features)
 
-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
-        """Creates a MorphAnalysisC from a list of intified
-        ("Field", "Field=Value") tuples where fields with multiple values have
-        been split into individual tuples, e.g.:
-        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
-        ("Field2", "Field2=Value3")]
-        """
-        cdef MorphAnalysisC tag
-        tag.length = len(field_feature_pairs)
-        if tag.length > 0:
-            tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-            tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-        for i, (field, feature) in enumerate(field_feature_pairs):
-            tag.fields[i] = field
-            tag.features[i] = feature
-        return tag
-
-    cdef int insert(self, MorphAnalysisC tag) except -1:
-        cdef hash_t key = tag.key
-        if self.tags.get(key) == NULL:
-            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            tag_ptr[0] = tag
-            self.tags.set(key, <void*>tag_ptr)
-
-    def get(self, hash_t morph):
-        tag = <MorphAnalysisC*>self.tags.get(morph)
-        if tag == NULL:
-            return ""
-        else:
-            return self.strings[tag.key]
+    def add(self, features):
+        return self._add(features)
+
+    def get(self, morph_key):
+        return self.get_morph_str(morph_key)
+
+    def normalize_features(self, features):
+        return self._normalize_features(features)
 
     @staticmethod
-    def feats_to_dict(feats):
+    def feats_to_dict(feats, *, sort_values=True):
         if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
-        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
-                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
+
+        out = {}
+        for feat in feats.split(Morphology.FEATURE_SEP):
+            field, values = feat.split(Morphology.FIELD_SEP, 1)
+            if sort_values:
+                values = values.split(Morphology.VALUE_SEP)
+                values.sort()
+                values = Morphology.VALUE_SEP.join(values)
+
+            out[field] = values
+        return out
 
     @staticmethod
     def dict_to_feats(feats_dict):
@@ -163,34 +215,34 @@ cdef class Morphology:
         return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
 
 
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
     cdef int i
-    for i in range(morph.length):
-        if morph.features[i] == feature:
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].value == feature:
             return True
     return False
 
 
-cdef list list_features(const MorphAnalysisC* morph):
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
     cdef int i
     features = []
-    for i in range(morph.length):
-        features.append(morph.features[i])
+    for i in range(deref(morph).features.size()):
+        features.append(deref(morph).features[i].value)
     return features
 
 
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
-    cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
+    cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
     n = get_n_by_field(<uint64_t*>results.data, morph, field)
     return results[:n]
 
 
-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
     cdef int n_results = 0
     cdef int i
-    for i in range(morph.length):
-        if morph.fields[i] == field:
-            results[n_results] = morph.features[i]
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].field == field:
+            results[n_results] = deref(morph).features[i].value
             n_results += 1
     return n_results
 
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d415ae43c5c..bdbe75fd824 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -132,8 +132,8 @@ class Morphologizer(Tagger):
 
     @property
     def labels(self):
-        """RETURNS (Tuple[str]): The labels currently added to the component."""
-        return tuple(self.cfg["labels_morph"].keys())
+        """RETURNS (Iterable[str]): The labels currently added to the component."""
+        return self.cfg["labels_morph"].keys()
 
     @property
     def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
@@ -156,7 +156,7 @@ class Morphologizer(Tagger):
         # normalize label
         norm_label = self.vocab.morphology.normalize_features(label)
         # extract separate POS and morph tags
-        label_dict = Morphology.feats_to_dict(label)
+        label_dict = Morphology.feats_to_dict(label, sort_values=False)
         pos = label_dict.get(self.POS_FEAT, "")
         if self.POS_FEAT in label_dict:
             label_dict.pop(self.POS_FEAT)
@@ -194,7 +194,7 @@ class Morphologizer(Tagger):
                         continue
                     morph = str(token.morph)
                     # create and add the combined morph+POS label
-                    morph_dict = Morphology.feats_to_dict(morph)
+                    morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                     if pos:
                         morph_dict[self.POS_FEAT] = pos
                     norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@@ -211,7 +211,7 @@ class Morphologizer(Tagger):
             for i, token in enumerate(example.reference):
                 pos = token.pos_
                 morph = str(token.morph)
-                morph_dict = Morphology.feats_to_dict(morph)
+                morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@@ -235,26 +235,29 @@ class Morphologizer(Tagger):
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         cdef bint extend = self.cfg["extend"]
-        labels = self.labels
+
+        # We require random access for the upcoming ops, so we need
+        # to allocate a compatible container out of the iterable.
+        labels = tuple(self.labels)
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
-                morph = labels[tag_id]
+                morph = labels[int(tag_id)]
                 # set morph
                 if doc.c[j].morph == 0 or overwrite or extend:
                     if overwrite and extend:
                         # morphologizer morph overwrites any existing features
                         # while extending
-                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
-                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
+                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
                         doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                     elif extend:
                         # existing features are preserved and any new features
                         # are added
-                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
-                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
+                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
                         doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                     else:
                         # clobber
@@ -274,8 +277,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
-                                                    label_smoothing=self.cfg["label_smoothing"])
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
@@ -296,7 +298,7 @@ class Morphologizer(Tagger):
                     label = None
                 # Otherwise, generate the combined label
                 else:
-                    label_dict = Morphology.feats_to_dict(morph)
+                    label_dict = Morphology.feats_to_dict(morph, sort_values=False)
                     if pos:
                         label_dict[self.POS_FEAT] = pos
                     label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 8cfcc2964f6..e7513cc11b7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -57,14 +57,6 @@ cdef struct TokenC:
     hash_t ent_id
 
 
-cdef struct MorphAnalysisC:
-    hash_t key
-    int length
-
-    attr_t* fields
-    attr_t* features
-
-
 # Internal struct, for storage and disambiguation of entities.
 cdef struct KBEntryC:
 
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 728f0aaf75a..f866488ecc2 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,9 +1,12 @@
-from ..structs cimport MorphAnalysisC
-from ..typedefs cimport hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t
+from ..morphology cimport MorphAnalysisC
+from libcpp.memory cimport shared_ptr
 
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
     cdef readonly hash_t key
-    cdef MorphAnalysisC c
+    cdef shared_ptr[MorphAnalysisC] c
+
+    cdef void _init_c(self, hash_t key)
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index ea5d07fa449..ceaa3ecd04e 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -8,6 +8,13 @@ from ..morphology import Morphology
 from ..morphology cimport check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t, attr_t
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
 
 
 cdef class MorphAnalysis:
@@ -15,39 +22,38 @@ cdef class MorphAnalysis:
     def __init__(self, Vocab vocab, features=dict()):
         self.vocab = vocab
         self.key = self.vocab.morphology.add(features)
-        analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key)
-        if analysis is not NULL:
-            self.c = analysis[0]
+        self._init_c(self.key)
+
+    cdef void _init_c(self, hash_t key):
+        cdef shared_ptr[MorphAnalysisC] analysis = self.vocab.morphology.get_morph_c(key)
+        if analysis:
+            self.c = analysis
         else:
-            memset(&self.c, 0, sizeof(self.c))
+            self.c = EMPTY_MORPH_TAG
 
     @classmethod
     def from_id(cls, Vocab vocab, hash_t key):
         """Create a morphological analysis from a given ID."""
-        cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
+        cdef MorphAnalysis morph = MorphAnalysis(vocab)
         morph.vocab = vocab
         morph.key = key
-        analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key)
-        if analysis is not NULL:
-            morph.c = analysis[0]
-        else:
-            memset(&morph.c, 0, sizeof(morph.c))
+        morph._init_c(key)
         return morph
 
     def __contains__(self, feature):
         """Test whether the morphological analysis contains some feature."""
         cdef attr_t feat_id = self.vocab.strings.as_int(feature)
-        return check_feature(&self.c, feat_id)
+        return check_feature(self.c, feat_id)
 
     def __iter__(self):
         """Iterate over the features in the analysis."""
         cdef attr_t feature
-        for feature in list_features(&self.c):
+        for feature in list_features(self.c):
             yield self.vocab.strings[feature]
 
     def __len__(self):
         """The number of features in the analysis."""
-        return self.c.length
+        return deref(self.c).features.size()
 
     def __hash__(self):
         return self.key
@@ -63,11 +69,7 @@ cdef class MorphAnalysis:
     def get(self, field, default=None):
         """Retrieve feature values by field."""
         cdef attr_t field_id = self.vocab.strings.as_int(field)
-        cdef np.ndarray results = get_by_field(&self.c, field_id)
-        if len(results) == 0:
-            if default is None:
-                default = []
-            return default
+        cdef np.ndarray results = get_by_field(self.c, field_id)
         features = [self.vocab.strings[result] for result in results]
         return [f.split(Morphology.FIELD_SEP)[1] for f in features]
 
@@ -75,7 +77,7 @@ cdef class MorphAnalysis:
         """Produce a json serializable representation as a UD FEATS-style
         string.
         """
-        morph_string = self.vocab.strings[self.c.key]
+        morph_string = self.vocab.strings[deref(self.c).key]
         if morph_string == self.vocab.morphology.EMPTY_MORPH:
             return ""
         return morph_string
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 2ed736b7035..c0cd0af42c0 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -39,6 +39,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref
 
 
 cdef class Token:
@@ -257,7 +258,7 @@ cdef class Token:
             # Check that the morph has the same vocab
             if self.vocab != morph.vocab:
                 raise ValueError(Errors.E1013)
-            self.c.morph = morph.c.key
+            self.c.morph = deref(morph.c).key
 
     def set_morph(self, features):
         cdef hash_t key
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 8f189d129c3..ce16f534219 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -402,7 +402,7 @@ coarse-grained POS as the feature `POS`.
 
 | Name        | Description                                            |
 | ----------- | ------------------------------------------------------ |
-| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
+| **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}
 

From bd401eaf4a5488c2ac830efe2085a4ae73535cc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 29 Jul 2022 15:12:19 +0200
Subject: [PATCH 004/504] precompute_hiddens/Parser: look up CPU ops once (v4)
 (#11068)

* precompute_hiddens/Parser: look up CPU ops once

* precompute_hiddens: make cpu_ops private
---
 spacy/ml/parser_model.pyx            | 8 +++-----
 spacy/pipeline/transition_parser.pxd | 1 +
 spacy/pipeline/transition_parser.pyx | 8 ++------
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index f004c562e7d..cb323e98891 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -386,6 +386,7 @@ cdef class precompute_hiddens:
     cdef bint _is_synchronized
     cdef public object ops
     cdef public object numpy_ops
+    cdef public object _cpu_ops
     cdef np.ndarray _features
     cdef np.ndarray _cached
     cdef np.ndarray bias
@@ -416,6 +417,7 @@ cdef class precompute_hiddens:
         self.nO = cached.shape[2]
         self.ops = lower_model.ops
         self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
         assert activation in (None, "relu", "maxout")
         self.activation = activation
         self._is_synchronized = False
@@ -478,11 +480,7 @@ cdef class precompute_hiddens:
         # - Output from backward on GPU
         bp_hiddens = self._bp_hiddens
 
-        cdef CBlas cblas
-        if isinstance(self.ops, CupyOps):
-            cblas = NUMPY_OPS.cblas()
-        else:
-            cblas = self.ops.cblas()
+        cdef CBlas cblas = self._cpu_ops.cblas()
 
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index 7ddb91e0184..7ef20563b12 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -12,6 +12,7 @@ cdef class Parser(TrainablePipe):
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
     cdef public object _multitasks
+    cdef object _cpu_ops
 
     cdef void _parseC(
         self,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9a278fc1328..b8ebbf8ca88 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -135,6 +135,7 @@ cdef class Parser(TrainablePipe):
 
         self._rehearsal_model = None
         self.scorer = scorer
+        self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -273,12 +274,7 @@ cdef class Parser(TrainablePipe):
     def greedy_parse(self, docs, drop=0.):
         cdef vector[StateC*] states
         cdef StateClass state
-        ops = self.model.ops
-        cdef CBlas cblas
-        if isinstance(ops, CupyOps):
-            cblas = NUMPY_OPS.cblas()
-        else:
-            cblas = ops.cblas()
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
         set_dropout_rate(self.model, drop)
         batch = self.moves.init_batch(docs)

From 8171ab393d1183af2df2aa0b0d80df32aab7412d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 10 Aug 2022 11:44:05 +0200
Subject: [PATCH 005/504] Rename modules for consistency (#11286)

* rename Python module to entity_ruler

* rename Python module to attribute_ruler
---
 spacy/pipeline/__init__.py                               | 6 +++---
 spacy/pipeline/{attributeruler.py => attribute_ruler.py} | 0
 spacy/pipeline/{entityruler.py => entity_ruler.py}       | 0
 website/docs/api/attributeruler.mdx                      | 6 +++---
 website/docs/api/entityruler.mdx                         | 6 +++---
 website/docs/usage/saving-loading.mdx                    | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)
 rename spacy/pipeline/{attributeruler.py => attribute_ruler.py} (100%)
 rename spacy/pipeline/{entityruler.py => entity_ruler.py} (100%)

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2c4a5a8a87f..82d24486a27 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,9 +1,9 @@
-from .attributeruler import AttributeRuler
+from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
-from .entityruler import EntityRuler
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .ner import EntityRecognizer
+from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attribute_ruler.py
similarity index 100%
rename from spacy/pipeline/attributeruler.py
rename to spacy/pipeline/attribute_ruler.py
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entity_ruler.py
similarity index 100%
rename from spacy/pipeline/entityruler.py
rename to spacy/pipeline/entity_ruler.py
diff --git a/website/docs/api/attributeruler.mdx b/website/docs/api/attributeruler.mdx
index c1831918752..e8cb248f85b 100644
--- a/website/docs/api/attributeruler.mdx
+++ b/website/docs/api/attributeruler.mdx
@@ -1,8 +1,8 @@
 ---
 title: AttributeRuler
 tag: class
-source: spacy/pipeline/attributeruler.py
-version: 3
+source: spacy/pipeline/attribute_ruler.py
+new: 3
 teaser: 'Pipeline component for rule-based token attribute assignment'
 api_string_name: attribute_ruler
 api_trainable: false
@@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the
 | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
+%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py
 ```
 
 ## AttributeRuler.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 27624398ec6..a35b6e2566c 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,8 +1,8 @@
 ---
 title: EntityRuler
 tag: class
-source: spacy/pipeline/entityruler.py
-version: 2.1
+source: spacy/pipeline/entity_ruler.py
+new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
@@ -65,7 +65,7 @@ how the component should be configured. You can override its settings via the
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/entityruler.py
+%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
 ```
 
 ## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 9a6791d5e0a..b44bd86ed06 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -189,7 +189,7 @@ the data to and from a JSON file.
 >
 > To see custom serialization methods in action, check out the new
 > [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the
+> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with a rule-based entity recognizer and including all

From 09864c455cffbb721cfd79581dafe6b7d651b86d Mon Sep 17 00:00:00 2001
From: antonpibm <51074867+antonpibm@users.noreply.github.com>
Date: Thu, 11 Aug 2022 12:26:26 +0300
Subject: [PATCH 006/504] Match private networks as URLs (#11121)

---
 spacy/lang/tokenizer_exceptions.py | 4 ----
 spacy/tests/tokenizer/test_urls.py | 5 ++++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index dbf9aab4912..a612ae8ac7e 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -16,10 +16,6 @@
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
-    # private & local networks
-    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
-    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
-    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
     # IP address dotted notation octets
     # excludes loopback network 0.0.0.0
     # excludes reserved space >= 224.0.0.0
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index ff8812be183..4753462a506 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -32,6 +32,9 @@
     "http://userid:password@example.com/",
     "http://142.42.1.1/",
     "http://142.42.1.1:8080/",
+    "http://10.140.12.13/foo",
+    "http://10.140.12.13/foo/bar?arg1=baz&arg2=taz",
+    "http://10.1.1.1",
     "http://foo.com/blah_(wikipedia)#cite-1",
     "http://foo.com/blah_(wikipedia)_blah#cite-1",
     "http://foo.com/unicode_(✪)_in_parens",
@@ -93,6 +96,7 @@
     "http://foo.bar/foo(bar)baz quux",
     "http://-error-.invalid/",
     "http://a.b-.co",
+    # Loopback and broadcast addresses should be excluded
     "http://0.0.0.0",
     "http://10.1.1.0",
     "http://10.1.1.255",
@@ -101,7 +105,6 @@
     "http://3628126748",
     "http://.www.foo.bar/",
     "http://.www.foo.bar./",
-    "http://10.1.1.1",
     "NASDAQ:GOOG",
     "http://-a.b.co",
     pytest.param("foo.com", marks=pytest.mark.xfail()),

From dbe67a1b0d852d7540ec9f6c7915872cffb79156 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Aug 2022 12:13:54 +0200
Subject: [PATCH 007/504] Remove intify_attrs(_do_deprecated) (#11319)

---
 spacy/attrs.pyx                | 71 +---------------------------------
 spacy/tests/lang/test_attrs.py |  8 ----
 spacy/tokenizer.pyx            |  4 +-
 spacy/vocab.pyx                |  3 +-
 4 files changed, 4 insertions(+), 82 deletions(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 363dd094dcd..0a4aecc5d85 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -98,7 +98,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 locals().update(IDS)
 
 
-def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
+def intify_attrs(stringy_attrs, strings_map=None):
     """
     Normalize a dictionary of attributes, converting them to ints.
 
@@ -110,75 +110,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         converted to ints.
     """
     inty_attrs = {}
-    if _do_deprecated:
-        if "F" in stringy_attrs:
-            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if "L" in stringy_attrs:
-            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if "pos" in stringy_attrs:
-            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")  # no-cython-lint
-        if "number" in stringy_attrs:
-            stringy_attrs.pop("number")
-        if "tenspect" in stringy_attrs:
-            stringy_attrs.pop("tenspect")
-        morph_keys = [
-            "PunctType",
-            "PunctSide",
-            "Other",
-            "Degree",
-            "AdvType",
-            "Number",
-            "VerbForm",
-            "PronType",
-            "Aspect",
-            "Tense",
-            "PartType",
-            "Poss",
-            "Hyph",
-            "ConjType",
-            "NumType",
-            "Foreign",
-            "VerbType",
-            "NounType",
-            "Gender",
-            "Mood",
-            "Negative",
-            "Tense",
-            "Voice",
-            "Abbr",
-            "Derivation",
-            "Echo",
-            "Foreign",
-            "NameType",
-            "NounType",
-            "NumForm",
-            "NumValue",
-            "PartType",
-            "Polite",
-            "StyleVariant",
-            "PronType",
-            "AdjType",
-            "Person",
-            "Variant",
-            "AdpType",
-            "Reflex",
-            "Negative",
-            "Mood",
-            "Aspect",
-            "Case",
-            "Polarity",
-            "PrepCase",
-            "Animacy",  # U20
-        ]
-        for key in morph_keys:
-            if key in stringy_attrs:
-                stringy_attrs.pop(key)
-            elif key.lower() in stringy_attrs:
-                stringy_attrs.pop(key.lower())
-            elif key.upper() in stringy_attrs:
-                stringy_attrs.pop(key.upper())
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index fd96e8f9bd4..0f52c3ed511 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -31,14 +31,6 @@ def test_attrs_idempotence(text):
     assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
 
 
-@pytest.mark.parametrize("text", ["dog"])
-def test_attrs_do_deprecated(text):
-    int_attrs = intify_attrs(
-        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
-    )
-    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
-
-
 def test_attrs_ent_iob_intify():
     int_attrs = intify_attrs({"ENT_IOB": ""})
     assert int_attrs == {ENT_IOB: 0}
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 6f2b10734c5..c95392a2026 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -579,7 +579,7 @@ cdef class Tokenizer:
         substrings (iterable): A sequence of dicts, where each dict describes
             a token and its attributes.
         """
-        attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
+        attrs = [intify_attrs(spec) for spec in substrings]
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
@@ -647,7 +647,7 @@ cdef class Tokenizer:
             url_match = re.compile("a^").match
         special_cases = {}
         for orth, special_tokens in self.rules.items():
-            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
+            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens]
         tokens = []
         for substring in text.split():
             suffixes = []
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4004a70e034..c03226e2467 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -273,8 +273,7 @@ cdef class Vocab:
         cdef int i
         tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
         for i, props in enumerate(substrings):
-            props = intify_attrs(props, strings_map=self.strings,
-                                 _do_deprecated=True)
+            props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
             lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])

From 2854f4d6cecdf63cdf1a3a9e5792995703712a70 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 22 Aug 2022 15:52:24 +0200
Subject: [PATCH 008/504] Cleanup Cython structs (#11337)

* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumpsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
---
 spacy/tokenizer.pxd |  6 +-----
 spacy/tokenizer.pyx | 11 +++++++++--
 spacy/vocab.pxd     |  1 -
 spacy/vocab.pyi     |  1 -
 spacy/vocab.pyx     |  7 ++-----
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index a902ebad941..f64e0e93413 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,11 +23,7 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO convert to bool in v4
-    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef bint _faster_heuristics
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index c95392a2026..9b79207f82e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -8,11 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
+
+from .tokens.doc cimport Doc
+from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
+from .attrs import intify_attrs
+from .symbols import ORTH, NORM
+from .errors import Errors
 from . import util
+from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
@@ -124,10 +131,10 @@ cdef class Tokenizer:
 
     property faster_heuristics:
         def __get__(self):
-            return bool(self._faster_heuristics)
+            return self._faster_heuristics
 
         def __set__(self, faster_heuristics):
-            self._faster_heuristics = bool(faster_heuristics)
+            self._faster_heuristics = faster_heuristics
             self._reload_special_cases()
 
     def __reduce__(self):
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 43e47af1dee..b91ce3ab45b 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -32,7 +32,6 @@ cdef class Vocab:
     cdef public object writing_system
     cdef public object get_noun_chunks
     cdef readonly int length
-    cdef public object _unused_object  # TODO remove in v4, see #9150
     cdef public object lex_attr_getters
     cdef public object cfg
 
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index b7ff20348a0..7f5f23e7847 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -73,7 +73,6 @@ def unpickle_vocab(
     sstore: StringStore,
     vectors: Any,
     morphology: Any,
-    _unused_object: Any,
     lex_attr_getters: Any,
     lookups: Any,
     get_noun_chunks: Any,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index c03226e2467..834f21c35dc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -579,21 +579,18 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
     morph = vocab.morphology
-    _unused_object = vocab._unused_object
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
     get_noun_chunks = vocab.get_noun_chunks
     return (unpickle_vocab,
-            (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
+            (sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))
 
 
-def unpickle_vocab(sstore, vectors, morphology, _unused_object,
-                   lex_attr_getters, lookups, get_noun_chunks):
+def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
     vocab.morphology = morphology
-    vocab._unused_object = _unused_object
     vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
     vocab.get_noun_chunks = get_noun_chunks

From 185883e66c44d4b5d134371c556cbeace7f0ba29 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 22 Aug 2022 20:28:57 +0200
Subject: [PATCH 009/504] Make Span/Doc.ents more consistent for ent_kb_id and
 ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
---
 spacy/tests/doc/test_add_entities.py       |  27 ++++
 spacy/tests/doc/test_span.py               |  56 +++-----
 spacy/tokens/doc.pyx                       |  12 +-
 spacy/tokens/span.pyi                      |  24 ++--
 spacy/tokens/span.pyx                      |  35 ++---
 website/docs/api/span.mdx                  |  46 +++----
 website/docs/api/token.mdx                 | 144 ++++++++++-----------
 website/docs/usage/rule-based-matching.mdx |   6 +-
 8 files changed, 186 insertions(+), 164 deletions(-)

diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 259b21fb3dd..586b8a745f6 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -46,6 +46,33 @@ def test_ents_reset(en_vocab):
     assert [t.ent_iob_ for t in doc] == orig_iobs
 
 
+def test_ents_clear(en_vocab):
+    """Ensure that removing entities clears token attributes"""
+    text = ["Louisiana", "Office", "of", "Conservation"]
+    doc = Doc(en_vocab, words=text)
+    entity = Span(doc, 0, 4, label=391, span_id="TEST")
+    doc.ents = [entity]
+    doc.ents = []
+    for token in doc:
+        assert token.ent_iob == 2
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.ents = [entity]
+    doc.set_ents([], default="missing")
+    for token in doc:
+        assert token.ent_iob == 0
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.set_ents([], default="blocked")
+    for token in doc:
+        assert token.ent_iob == 3
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+
+
 def test_add_overlapping_entities(en_vocab):
     text = ["Louisiana", "Office", "of", "Conservation"]
     doc = Doc(en_vocab, words=text)
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index e5c71dafcf7..ab8538b17dc 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -703,41 +703,21 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2
 
 
-def test_for_partial_ent_sents():
-    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
-    which this tests for.
-    """
-    doc = Doc(
-        English().vocab,
-        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
-        sent_starts=[1, 0, 0, 1, 0, 0],
-    )
-    doc.set_ents([Span(doc, 1, 4, "WORK")])
-    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
-    # equal to the sentences referenced in ent.sents.
-    for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
-        assert doc_sent == ent_sent
-
-
-def test_for_no_ent_sents():
-    """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
-    sentence.
-    """
-    doc = Doc(
-        English().vocab,
-        words=["This", "is", "a", "test.", "ENTITY"],
-        sent_starts=[1, 0, 0, 0, 1],
-    )
-    doc.set_ents([Span(doc, 4, 5, "WORK")])
-    sents = list(doc.ents[0].sents)
-    assert len(sents) == 1
-    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
-
-
-def test_span_api_richcmp_other(en_tokenizer):
-    doc1 = en_tokenizer("a b")
-    doc2 = en_tokenizer("b c")
-    assert not doc1[1:2] == doc1[1]
-    assert not doc1[1:2] == doc2[0]
-    assert not doc1[1:2] == doc2[0:1]
-    assert not doc1[0:1] == doc2
+@pytest.mark.issue(11113)
+def test_span_ent_id(en_tokenizer):
+    doc = en_tokenizer("a b c d")
+    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
+    span = doc.ents[0]
+    assert doc[1].ent_id_ == "ID0"
+
+    # setting Span.id sets Token.ent_id
+    span.id_ = "ID1"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID1"
+    assert doc[1].ent_id_ == "ID1"
+
+    # Span.ent_id is an alias of Span.id
+    span.ent_id_ = "ID2"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID2"
+    assert doc[1].ent_id_ == "ID2"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 181c0ce0fce..50fc6e536c2 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -841,27 +841,33 @@ cdef class Doc:
                     self.c[i].ent_iob = 1
                 self.c[i].ent_type = span.label
                 self.c[i].ent_kb_id = span.kb_id
-                # for backwards compatibility in v3, only set ent_id from
-                # span.id if it's set, otherwise don't override
-                self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
+                self.c[i].ent_id = span.id
         for span in blocked:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 3
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
         for span in missing:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 0
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
         for span in outside:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 2
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
 
         # Set tokens outside of all provided spans
         if default != SetEntsDefault.unmodified:
             for i in range(self.length):
                 if i not in seen_tokens:
                     self.c[i].ent_type = 0
+                    self.c[i].ent_kb_id = 0
+                    self.c[i].ent_id = 0
                     if default == SetEntsDefault.outside:
                         self.c[i].ent_iob = 2
                     elif default == SetEntsDefault.missing:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index b982eb810b8..a6731d1c2d4 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -125,15 +125,23 @@ class Span:
     end: int
     start_char: int
     end_char: int
-    label: int
-    kb_id: int
-    id: int
-    ent_id: int
-    ent_id_: str
+    @property
+    def label(self) -> int: ...
+    @property
+    def kb_id(self) -> int: ...
+    @property
+    def id(self) -> int: ...
+    @property
+    def ent_id(self) -> int: ...
     @property
     def orth_(self) -> str: ...
     @property
     def lemma_(self) -> str: ...
-    label_: str
-    kb_id_: str
-    id_: str
+    @property
+    def label_(self) -> str: ...
+    @property
+    def kb_id_(self) -> str: ...
+    @property
+    def id_(self) -> str: ...
+    @property
+    def ent_id_(self) -> str: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 17c4c4c6059..b212b4c4303 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -836,26 +836,18 @@ cdef class Span:
 
     property id:
         def __get__(self):
-            return self.c.id
+            return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.c.id = id
+            self.span_c().id = id
 
     property ent_id:
-        """RETURNS (uint64): The entity ID."""
+        """Alias for the span's ID."""
         def __get__(self):
-            return self.root.ent_id
+            return self.id
 
-        def __set__(self, hash_t key):
-            raise NotImplementedError(Errors.E200.format(attr="ent_id"))
-
-    property ent_id_:
-        """RETURNS (str): The (string) entity ID."""
-        def __get__(self):
-            return self.root.ent_id_
-
-        def __set__(self, str key):
-            raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
+        def __set__(self, attr_t ent_id):
+            self.id = ent_id
 
     @property
     def orth_(self):
@@ -871,7 +863,7 @@ cdef class Span:
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
     property label_:
-        """RETURNS (str): The span's label."""
+        """The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
 
@@ -879,7 +871,7 @@ cdef class Span:
             self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
-        """RETURNS (str): The span's KB ID."""
+        """The span's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
 
@@ -887,13 +879,22 @@ cdef class Span:
             self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
     property id_:
-        """RETURNS (str): The span's ID."""
+        """The span's ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.id]
 
         def __set__(self, str id_):
             self.id = self.doc.vocab.strings.add(id_)
 
+    property ent_id_:
+        """Alias for the span's ID."""
+        def __get__(self):
+            return self.id_
+
+        def __set__(self, str ent_id_):
+            self.id_ = ent_id_
+
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 41422a5b4e1..5e7495f17ca 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -547,26 +547,26 @@ overlaps with will be returned.
 
 ## Attributes {id="attributes"}
 
-| Name           | Description                                                                                                                   |
-| -------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `doc`          | The parent document. ~~Doc~~                                                                                                  |
-| `tensor`       | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
-| `start`        | The token offset for the start of the span. ~~int~~                                                                           |
-| `end`          | The token offset for the end of the span. ~~int~~                                                                             |
-| `start_char`   | The character offset for the start of the span. ~~int~~                                                                       |
-| `end_char`     | The character offset for the end of the span. ~~int~~                                                                         |
-| `text`         | A string representation of the span text. ~~str~~                                                                             |
-| `text_with_ws` | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~                          |
-| `orth`         | ID of the verbatim text content. ~~int~~                                                                                      |
-| `orth_`        | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~            |
-| `label`        | The hash value of the span's label. ~~int~~                                                                                   |
-| `label_`       | The span's label. ~~str~~                                                                                                     |
-| `lemma_`       | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~                                      |
-| `kb_id`        | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
-| `kb_id_`       | The knowledge base ID referred to by the span. ~~str~~                                                                        |
-| `ent_id`       | The hash value of the named entity the root token is an instance of. ~~int~~                                                  |
-| `ent_id_`      | The string ID of the named entity the root token is an instance of. ~~str~~                                                   |
-| `id`           | The hash value of the span's ID. ~~int~~                                                                                      |
-| `id_`          | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`    | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
-| `_`            | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
+| Name                                    | Description                                                                                                                   |
+| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                   | The parent document. ~~Doc~~                                                                                                  |
+| `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
+| `start`                                 | The token offset for the start of the span. ~~int~~                                                                           |
+| `end`                                   | The token offset for the end of the span. ~~int~~                                                                             |
+| `start_char`                            | The character offset for the start of the span. ~~int~~                                                                       |
+| `end_char`                              | The character offset for the end of the span. ~~int~~                                                                         |
+| `text`                                  | A string representation of the span text. ~~str~~                                                                             |
+| `text_with_ws`                          | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~                          |
+| `orth`                                  | ID of the verbatim text content. ~~int~~                                                                                      |
+| `orth_`                                 | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~            |
+| `label`                                 | The hash value of the span's label. ~~int~~                                                                                   |
+| `label_`                                | The span's label. ~~str~~                                                                                                     |
+| `lemma_`                                | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~                                      |
+| `kb_id`                                 | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
+| `kb_id_`                                | The knowledge base ID referred to by the span. ~~str~~                                                                        |
+| `ent_id`                                | Alias for `id`: the hash value of the span's ID. ~~int~~                                                                      |
+| `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
+| `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
+| `id_`                                   | The span's ID. ~~str~~                                                                                                        |
+| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
+| `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 63ee1080bf1..12b99394350 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -403,75 +403,75 @@ The L2 norm of the token's vector representation.
 
 ## Attributes {id="attributes"}
 
-| Name                               | Description                                                                                                                                                                                                                                                          |
-| ---------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc`                              | The parent document. ~~Doc~~                                                                                                                                                                                                                                         |
-| `lex` <Tag variant="new">3</Tag>   | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                    |
-| `sent`                             | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                             |
-| `text`                             | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
-| `text_with_ws`                     | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                      |
-| `whitespace_`                      | Trailing space character if present. ~~str~~                                                                                                                                                                                                                         |
-| `orth`                             | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
-| `orth_`                            | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                  |
-| `vocab`                            | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                      |
-| `tensor`                           | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
-| `head`                             | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                        |
-| `left_edge`                        | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
-| `right_edge`                       | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                 |
-| `i`                                | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                           |
-| `ent_type`                         | Named entity type. ~~int~~                                                                                                                                                                                                                                           |
-| `ent_type_`                        | Named entity type. ~~str~~                                                                                                                                                                                                                                           |
-| `ent_iob`                          | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                 |
-| `ent_iob_`                         | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
-| `ent_kb_id`                        | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
-| `ent_kb_id_`                       | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
-| `ent_id`                           | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~                                                                                                                                        |
-| `ent_id_`                          | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                        |
-| `lemma`                            | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
-| `lemma_`                           | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
-| `norm`                             | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
-| `norm_`                            | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                   |
-| `lower`                            | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                 |
-| `lower_`                           | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                        |
-| `shape`                            | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_`                           | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
-| `prefix`                           | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                           |
-| `prefix_`                          | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                         |
-| `suffix`                           | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                             |
-| `suffix_`                          | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                             |
-| `is_alpha`                         | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                      |
-| `is_ascii`                         | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                          |
-| `is_digit`                         | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                     |
-| `is_lower`                         | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                            |
-| `is_upper`                         | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                            |
-| `is_title`                         | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                            |
-| `is_punct`                         | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
-| `is_left_punct`                    | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
-| `is_right_punct`                   | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
-| `is_sent_start`                    | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
-| `is_sent_end`                      | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
-| `is_space`                         | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
-| `is_bracket`                       | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
-| `is_quote`                         | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |
-| `is_currency`                      | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                             |
-| `like_url`                         | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                              |
-| `like_num`                         | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                           |
-| `like_email`                       | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
-| `is_oov`                           | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
-| `is_stop`                          | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
-| `pos`                              | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
-| `pos_`                             | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
-| `tag`                              | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
-| `tag_`                             | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
-| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
-| `dep`                              | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                               |
-| `dep_`                             | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                               |
-| `lang`                             | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                |
-| `lang_`                            | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
-| `prob`                             | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
-| `idx`                              | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                        | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
-| `lex_id`                           | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
-| `rank`                             | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
-| `cluster`                          | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
-| `_`                                | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                        |
+| Name                                         | Description                                                                                                                                                                                                                                                          |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                        | The parent document. ~~Doc~~                                                                                                                                                                                                                                         |
+| `lex` <Tag variant="new">3</Tag>             | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                    |
+| `sent` <Tag variant="new">2.0.12</Tag>       | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                             |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
+| `text_with_ws`                               | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                      |
+| `whitespace_`                                | Trailing space character if present. ~~str~~                                                                                                                                                                                                                         |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
+| `orth_`                                      | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                  |
+| `vocab`                                      | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                      |
+| `tensor` <Tag variant="new">2.1.7</Tag>      | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
+| `head`                                       | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                        |
+| `left_edge`                                  | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
+| `right_edge`                                 | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                 |
+| `i`                                          | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                           |
+| `ent_type`                                   | Named entity type. ~~int~~                                                                                                                                                                                                                                           |
+| `ent_type_`                                  | Named entity type. ~~str~~                                                                                                                                                                                                                                           |
+| `ent_iob`                                    | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                 |
+| `ent_iob_`                                   | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
+| `ent_kb_id` <Tag variant="new">2.2</Tag>     | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
+| `ent_kb_id_` <Tag variant="new">2.2</Tag>    | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
+| `ent_id`                                     | ID of the entity the token is an instance of, if any. ~~int~~                                                                                                                                                                                                        |
+| `ent_id_`                                    | ID of the entity the token is an instance of, if any. ~~str~~                                                                                                                                                                                                        |
+| `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
+| `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
+| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
+| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                   |
+| `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                 |
+| `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                        |
+| `shape`                                      | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `prefix`                                     | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                           |
+| `prefix_`                                    | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                         |
+| `suffix`                                     | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                             |
+| `suffix_`                                    | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                             |
+| `is_alpha`                                   | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                      |
+| `is_ascii`                                   | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                          |
+| `is_digit`                                   | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                     |
+| `is_lower`                                   | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                            |
+| `is_upper`                                   | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                            |
+| `is_title`                                   | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                            |
+| `is_punct`                                   | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
+| `is_left_punct`                              | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
+| `is_right_punct`                             | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
+| `is_sent_start`                              | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
+| `is_sent_end`                                | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
+| `is_space`                                   | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
+| `is_bracket`                                 | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
+| `is_quote`                                   | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                             |
+| `like_url`                                   | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                              |
+| `like_num`                                   | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                           |
+| `like_email`                                 | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
+| `is_oov`                                     | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
+| `is_stop`                                    | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
+| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
+| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
+| `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
+| `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
+| `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
+| `dep`                                        | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                               |
+| `dep_`                                       | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                               |
+| `lang`                                       | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                |
+| `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
+| `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
+| `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
+| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
+| `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
+| `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
+| `_`                                          | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                        |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index e5b98da3a8c..c90172b4325 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1399,14 +1399,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
 ruler.add_patterns(patterns)
 
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
 
 doc2 = nlp("Apple is opening its first big office in San Fran.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
 If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `ent_id_` property of the matched entity is set to the `id` given
+patterns, the `id_` property of the matched entity is set to the `id` given
 in the patterns. So in the example above it's easy to identify that "San
 Francisco" and "San Fran" are both the same entity.
 

From e8cdfafb367c9bfaa53da74a8abab98332ec1285 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 26 Aug 2022 10:11:18 +0200
Subject: [PATCH 010/504] Switch to mecab-ko as default Korean tokenizer
 (#11294)

* Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.

* Temporarily run tests with mecab-ko tokenizer

* Fix types

* Fix duplicate test names

* Update requirements test

* Revert "Temporarily run tests with mecab-ko tokenizer"

This reverts commit d2083e7044403a2046f902b125a147525b703e29.

* Add mecab_args setting, fix pickle for KoreanNattoTokenizer

* Fix length check

* Update docs

* Formatting

* Update natto-py error message

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 setup.cfg                                 |   2 +-
 spacy/lang/ko/__init__.py                 | 121 +++++++++++++++++-----
 spacy/tests/conftest.py                   |  16 ++-
 spacy/tests/lang/ko/test_lemmatization.py |   8 ++
 spacy/tests/lang/ko/test_serialize.py     |  20 ++++
 spacy/tests/lang/ko/test_tokenizer.py     |  42 +++++++-
 spacy/tests/package/test_requirements.py  |   2 +-
 website/docs/usage/models.mdx             |  35 ++++++-
 8 files changed, 212 insertions(+), 34 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 5e8e99f8784..887486f12d6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -123,7 +123,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
 
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index e2c860f7de9..81052cb24aa 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -17,34 +17,23 @@
 
 [nlp.tokenizer]
 @tokenizers = "spacy.ko.KoreanTokenizer"
+mecab_args = ""
 """
 
 
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
-def create_tokenizer():
+def create_tokenizer(mecab_args: str):
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp.vocab)
+        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
 
     return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, vocab: Vocab):
+    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
-        self._mecab_tokenizer = None
-
-    @property
-    def mecab_tokenizer(self):
-        # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
-        # `spacy init vectors ko` for a pipeline that will have a different
-        # tokenizer in the end. The languages need to match for the vectors
-        # to be imported and there's no way to pass a custom config to
-        # `init vectors`.
-        if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
-        return self._mecab_tokenizer
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger(mecab_args)
 
     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)
@@ -67,13 +56,15 @@ def __call__(self, text: str) -> Doc:
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1],	종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5],	마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
                 break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
             if lemma == "*":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -96,20 +87,94 @@ class Korean(Language):
     Defaults = KoreanDefaults
 
 
-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        import mecab_ko as MeCab
 
         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "the python package `mecab-ko`: pip install mecab-ko"
         ) from None
 
 
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer
+
+    def __reduce__(self):
+        return KoreanNattoTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1],      종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5],     마지막 품사(end pos)[6], 표현(expression)[7], *
+        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
+            if node.is_eos():
+                break
+            surface = node.surface
+            feature = node.feature
+            tag, _, expr = feature.partition(",")
+            lemma, _, remainder = expr.partition("/")
+            if lemma == "*" or lemma == "":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 7db986ab9e7..2a9f441c9b0 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -245,7 +245,7 @@ def hsb_tokenizer():
 
 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer
 
 
@@ -267,6 +267,20 @@ def la_tokenizer():
     return get_lang_class("la")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py
index 7782ca4bcab..0c389b9ce52 100644
--- a/spacy/tests/lang/ko/test_lemmatization.py
+++ b/spacy/tests/lang/ko/test_lemmatization.py
@@ -7,3 +7,11 @@
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
index bba7bce6e05..eecc7d955ba 100644
--- a/spacy/tests/lang/ko/test_serialize.py
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -23,3 +23,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index 6e06e405e0b..e7f8a5c0d79 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -19,6 +19,8 @@
               "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on
 
+# tests for ko_tokenizer (default KoreanTokenizer)
+
 
 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     assert pos == expected_pos.split()
 
 
-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
 
@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"
 
 
+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index ff07c5b454a..704d4b90b44 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -25,7 +25,7 @@ def test_build_dependencies():
     libs_ignore_setup = [
         "numpy",
         "fugashi",
-        "natto-py",
+        "mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 7fed9f40765..9213dead16b 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -264,18 +264,49 @@ used for training the current [Japanese pipelines](/models/ja).
 
 ### Korean language support {id="korean"}
 
-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
+> # uses mecab-ko-dic
 > nlp = spacy.blank("ko")
+>
+> # with custom mecab args
+> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
+> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
+> nlp = spacy.blank("ko", config=config)
 > ```
 
-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
+requirements.
+
+The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
+earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
 
 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)
 
+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer:

From 00a38f741881157e9d425c08946e5b0c039379e3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Aug 2022 13:23:24 +0200
Subject: [PATCH 011/504] Remove setup_requires from setup.cfg (#11384)

* Remove setup_requires from setup.cfg

* Update requirements test to ignore cython in setup.cfg
---
 setup.cfg                                | 13 +------------
 spacy/tests/package/test_requirements.py |  2 +-
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 887486f12d6..1dbf8f56454 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,18 +30,7 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.7
-# NOTE: This section is superseded by pyproject.toml and will be removed in
-# spaCy v4
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0; python_version < "3.9"
-    numpy>=1.19.0; python_version >= "3.9"
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
+python_requires = >=3.6
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 704d4b90b44..a63b1d8b060 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -5,7 +5,7 @@
 def test_build_dependencies():
     # Check that library requirements are pinned exactly the same across different setup files.
     libs_ignore_requirements = [
-        "numpy",
+        "cython",
         "pytest",
         "pytest-timeout",
         "mock",

From a0fa0722b74a24c5d3286bd26c4e07ceab886759 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Aug 2022 13:56:35 +0200
Subject: [PATCH 012/504] Make stable private modules public and adjust names
 (#11353)

* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
---
 setup.py                                           |  2 +-
 .../ml/{_character_embed.py => character_embed.py} |  0
 spacy/ml/models/tok2vec.py                         |  6 ++++--
 spacy/pipeline/attribute_ruler.py                  |  4 ++--
 spacy/tests/pipeline/test_models.py                |  2 +-
 spacy/tests/pipeline/test_spancat.py               |  2 +-
 .../tests/serialize/test_serialize_span_groups.py  |  2 +-
 spacy/tokens/__init__.py                           |  3 ++-
 spacy/tokens/doc.pyi                               |  5 ++++-
 spacy/tokens/doc.pyx                               | 14 ++++++++++++++
 spacy/tokens/{_serialize.py => doc_bin.py}         | 11 ++++++-----
 spacy/tokens/{_retokenize.pyi => retokenizer.pyi}  |  0
 spacy/tokens/{_retokenize.pyx => retokenizer.pyx}  |  0
 spacy/tokens/{_dict_proxies.py => span_groups.py}  |  0
 14 files changed, 36 insertions(+), 15 deletions(-)
 rename spacy/ml/{_character_embed.py => character_embed.py} (100%)
 rename spacy/tokens/{_serialize.py => doc_bin.py} (97%)
 rename spacy/tokens/{_retokenize.pyi => retokenizer.pyi} (100%)
 rename spacy/tokens/{_retokenize.pyx => retokenizer.pyx} (100%)
 rename spacy/tokens/{_dict_proxies.py => span_groups.py} (100%)

diff --git a/setup.py b/setup.py
index 33178662df4..c9b4f7171e3 100755
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@
     "spacy.tokens.span_group",
     "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
-    "spacy.tokens._retokenize",
+    "spacy.tokens.retokenizer",
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/character_embed.py
similarity index 100%
rename from spacy/ml/_character_embed.py
rename to spacy/ml/character_embed.py
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 0edc8999114..a605d32cd40 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -21,7 +21,9 @@
 
 from ...attrs import intify_attr
 from ...errors import Errors
-from ...ml import _character_embed
+from ...ml import character_embed
+from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
@@ -241,7 +243,7 @@ def CharacterEmbed(
     if feature is None:
         raise ValueError(Errors.E911.format(feat=feature))
     char_embed = chain(
-        _character_embed.CharacterEmbed(nM=nM, nC=nC),
+        character_embed.CharacterEmbed(nM=nM, nC=nC),
         cast(Model[List[Floats2d], Ragged], list2ragged()),
     )
     feature_extractor: Model[List[Doc], Ragged] = chain(
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 8ac74d92bcd..126a48945bc 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -10,8 +10,8 @@
 from ..scorer import Scorer
 from ..symbols import IDS
 from ..tokens import Doc, Span
-from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
-from ..training import Example
+from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
+from ..vocab import Vocab
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py
index fef0017a8e1..4c0d352aa7f 100644
--- a/spacy/tests/pipeline/test_models.py
+++ b/spacy/tests/pipeline/test_models.py
@@ -8,7 +8,7 @@
 
 from spacy.lang.en import English
 from spacy.ml import FeatureExtractor, StaticVectors
-from spacy.ml._character_embed import CharacterEmbed
+from spacy.ml.character_embed import CharacterEmbed
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 9405a78e040..c143d193fa6 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -7,7 +7,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tokens import SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
 from spacy.util import fix_random_seed, make_tempdir, registry
 
diff --git a/spacy/tests/serialize/test_serialize_span_groups.py b/spacy/tests/serialize/test_serialize_span_groups.py
index 85313fcdcc3..c1c910fa137 100644
--- a/spacy/tests/serialize/test_serialize_span_groups.py
+++ b/spacy/tests/serialize/test_serialize_span_groups.py
@@ -1,7 +1,7 @@
 import pytest
 
 from spacy.tokens import Span, SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 
 
 @pytest.mark.issue(10685)
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 3393ca6eca9..e5a244360e3 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -3,6 +3,7 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .token import Token
+from .doc_bin import DocBin
+from .morphanalysis import MorphAnalysis
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f0b68862c32..0fae118b4b6 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -17,7 +17,10 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import Floats1d, Floats2d, Ints2d
-
+from .span import Span
+from .token import Token
+from .span_groups import SpanGroups
+from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
 from ..vocab import Vocab
 from ._dict_proxies import SpanGroups
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 50fc6e536c2..cee2eda6c53 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,6 +20,13 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
+from .span_groups import SpanGroups
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..typedefs cimport attr_t, flags_t
+from ..attrs cimport attr_id_t
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
 from ._dict_proxies import SpanGroups
 
@@ -50,6 +57,13 @@ from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors, Warnings
+from ..morphology import Morphology
+from .. import util
+from .. import parts_of_speech
+from .. import schemas
+from .underscore import Underscore, get_ext_args
+from .retokenizer import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from ._retokenize import Retokenizer
 from .underscore import Underscore, get_ext_args
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/doc_bin.py
similarity index 97%
rename from spacy/tokens/_serialize.py
rename to spacy/tokens/doc_bin.py
index 873d85835f0..8a08864d46e 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,11 +10,12 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import SimpleFrozenList, ensure_path
-from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
-from .doc import Doc
+from ..util import ensure_path, SimpleFrozenList
+from .span_groups import SpanGroups
+
+# fmt: off
+ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
+# fmt: on
 
 
 class DocBin:
diff --git a/spacy/tokens/_retokenize.pyi b/spacy/tokens/retokenizer.pyi
similarity index 100%
rename from spacy/tokens/_retokenize.pyi
rename to spacy/tokens/retokenizer.pyi
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/retokenizer.pyx
similarity index 100%
rename from spacy/tokens/_retokenize.pyx
rename to spacy/tokens/retokenizer.pyx
diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/span_groups.py
similarity index 100%
rename from spacy/tokens/_dict_proxies.py
rename to spacy/tokens/span_groups.py

From f2080557d73f911575db2f8ab901930d750e0374 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 30 Aug 2022 22:40:31 +0900
Subject: [PATCH 013/504] Update/remove old Matcher syntax (#11370)

* Clean up old Matcher call style related stuff

In v2 Matcher.add was called with (key, on_match, *patterns). In v3 this
was changed to (key, patterns, *, on_match=None), but there were various
points where the old call syntax was documented or handled specially.
This removes all those.

The Matcher itself didn't need any code changes, as it just gives a
generic type error. However the PhraseMatcher required some changes
because it would automatically "fix" the old call style.

Surprisingly, the tokenizer was still using the old call style in one
place.

After these changes tests failed in two places:

1. one test for the "new" call style, including the "old" call style. I
   removed this test.
2. deserializing the PhraseMatcher fails because the input docs are a
   set.

I am not sure why 2 is happening - I guess it's a quirk of the
serialization format? - so for now I just convert the set to a list when
deserializing. The check that the input Docs are a List in the
PhraseMatcher is a new check, but makes it parallel with the other
Matchers, which seemed like the right thing to do.

* Add notes related to input docs / deserialization type

* Remove Typing import

* Remove old note about call style change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Use separate method for setting internal doc representations

In addition to the title change, this changes the internal dict to be a
defaultdict, instead of a dict with frequent use of setdefault.

* Add _add_from_arrays for unpickling

* Cleanup around adding from arrays

This moves adding to internal structures into the private batch method,
and removes the single-add method.

This has one behavioral change for `add`, in that if something is wrong
with the list of input Docs (such as one of the items not being a Doc),
valid items before the invalid one will not be added. Also the callback
will not be updated if anything is invalid. This change should not be
significant.

This also adds a test to check failure when given a non-Doc.

* Update spacy/matcher/phrasematcher.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                            |   7 +-
 spacy/matcher/dependencymatcher.pyx        |   6 +-
 spacy/matcher/matcher.pyx                  |   6 +-
 spacy/matcher/phrasematcher.pyi            |   9 ++
 spacy/matcher/phrasematcher.pyx            | 118 ++++++++++++---------
 spacy/tests/matcher/test_phrase_matcher.py |  29 ++---
 spacy/tokenizer.pyx                        |   2 +-
 website/docs/api/matcher.mdx               |  14 ---
 website/docs/api/phrasematcher.mdx         |  22 +---
 9 files changed, 97 insertions(+), 116 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index cf9a7b7087a..146c60b6d60 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -500,7 +500,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
-    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
+    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
             "callable or None, but got: {arg_type}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
@@ -759,7 +759,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "loaded nlp object, but got: {source}")
     E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
-    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
+    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
     E949 = ("Unable to align tokens for the predicted and reference docs. It "
             "is only possible to align the docs when both texts are the same "
@@ -989,6 +989,9 @@ class Errors(metaclass=ErrorsWithCodes):
              "reduction. Please enable one of `use_reduce_first`, "
              "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
+    # v4 error strings
+    E4000 = ("Expected a Doc as input, but got: '{type}'")
+
 
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index ab5f5d5d14b..0b639ab04fb 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -175,9 +175,9 @@ cdef class DependencyMatcher:
         on_match (callable): Optional callback executed on match.
         """
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index f0116169a6b..715dd45f07c 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -113,9 +113,9 @@ cdef class Matcher:
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
         if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
             raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
         for i, pattern in enumerate(patterns):
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 27f6ba373fc..f9585da7893 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -21,6 +21,15 @@ class PhraseMatcher:
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
         ] = ...,
     ) -> None: ...
+    def _add_from_arrays(
+        self,
+        key: str,
+        specs: List[List[int]],
+        *,
+        on_match: Optional[
+            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
+        ] = ...,
+    ) -> None: ...
     def remove(self, key: str) -> None: ...
     @overload
     def __call__(
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 4efcdb05c43..6e3c52924fa 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,5 +1,8 @@
-# cython: infer_types=True
-from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
+# cython: infer_types=True, profile=True
+from typing import List
+from collections import defaultdict
+from libc.stdint cimport uintptr_t
+from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 
 import warnings
 
@@ -39,7 +42,7 @@ cdef class PhraseMatcher:
         """
         self.vocab = vocab
         self._callbacks = {}
-        self._docs = {}
+        self._docs = defaultdict(set)
         self._validate = validate
 
         self.mem = Pool()
@@ -155,66 +158,24 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-    def add(self, key, docs, *_docs, on_match=None):
-        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
-        key, an on_match callback, and one or more patterns.
 
-        Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
-        second argument, with the on_match callback as an optional keyword
-        argument.
+    def _add_from_arrays(self, key, specs, *, on_match=None):
+        """Add a preprocessed list of specs, with an optional callback.
 
         key (str): The match ID.
-        docs (list): List of `Doc` objects representing match patterns.
+        specs (List[List[int]]): A list of lists of hashes to match.
         on_match (callable): Callback executed on match.
-        *_docs (Doc): For backwards compatibility: list of patterns to add
-            as variable arguments. Will be ignored if a list of patterns is
-            provided as the second argument.
-
-        DOCS: https://spacy.io/api/phrasematcher#add
         """
-        if docs is None or hasattr(docs, "__call__"):  # old API
-            on_match = docs
-            docs = _docs
-
-        _ = self.vocab[key]
-        self._callbacks[key] = on_match
-        self._docs.setdefault(key, set())
-
         cdef MapStruct* current_node
         cdef MapStruct* internal_node
         cdef void* result
 
-        if isinstance(docs, Doc):
-            raise ValueError(Errors.E179.format(key=key))
-        for doc in docs:
-            if len(doc) == 0:
-                continue
-            if isinstance(doc, Doc):
-                attrs = (TAG, POS, MORPH, LEMMA, DEP)
-                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                for attr in attrs:
-                    if self.attr == attr and not has_annotation[attr]:
-                        if attr == TAG:
-                            pipe = "tagger"
-                        elif attr in (POS, MORPH):
-                            pipe = "morphologizer or tagger+attribute_ruler"
-                        elif attr == LEMMA:
-                            pipe = "lemmatizer"
-                        elif attr == DEP:
-                            pipe = "parser"
-                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
-                        raise ValueError(error_msg)
-                if self._validate and any(has_annotation.values()) \
-                        and self.attr not in attrs:
-                    string_attr = self.vocab.strings[self.attr]
-                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
-                keyword = self._convert_to_array(doc)
-            else:
-                keyword = doc
-            self._docs[key].add(tuple(keyword))
+        self._callbacks[key] = on_match
+        for spec in specs:
+            self._docs[key].add(tuple(spec))
 
             current_node = self.c_map
-            for token in keyword:
+            for token in spec:
                 if token == self._terminal_hash:
                     warnings.warn(Warnings.W021)
                     break
@@ -233,6 +194,57 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
+
+    def add(self, key, docs, *, on_match=None):
+        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
+        key, a list of one or more patterns, and (optionally) an on_match callback.
+
+        key (str): The match ID.
+        docs (list): List of `Doc` objects representing match patterns.
+        on_match (callable): Callback executed on match.
+
+        If any of the input Docs are invalid, no internal state will be updated.
+
+        DOCS: https://spacy.io/api/phrasematcher#add
+        """
+        if isinstance(docs, Doc):
+            raise ValueError(Errors.E179.format(key=key))
+        if docs is None or not isinstance(docs, List):
+            raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
+        if on_match is not None and not hasattr(on_match, "__call__"):
+            raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))
+
+        _ = self.vocab[key]
+        specs = []
+
+        for doc in docs:
+            if len(doc) == 0:
+                continue
+            if not isinstance(doc, Doc):
+                raise ValueError(Errors.E4000.format(type=type(doc)))
+
+            attrs = (TAG, POS, MORPH, LEMMA, DEP)
+            has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+            for attr in attrs:
+                if self.attr == attr and not has_annotation[attr]:
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer or tagger+attribute_ruler"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
+            if self._validate and any(has_annotation.values()) \
+                    and self.attr not in attrs:
+                string_attr = self.vocab.strings[self.attr]
+                warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
+            specs.append(self._convert_to_array(doc))
+
+        self._add_from_arrays(key, specs, on_match=on_match)
+
     def __call__(self, object doclike, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.
 
@@ -345,7 +357,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     matcher = PhraseMatcher(vocab, attr=attr)
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
-        matcher.add(key, specs, on_match=callback)
+        matcher._add_from_arrays(key, specs, on_match=callback)
     return matcher
 
 
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 7335bbdf107..4ad234cba3b 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -198,28 +198,6 @@ def test_phrase_matcher_contains(en_vocab):
     assert "TEST2" not in matcher
 
 
-def test_phrase_matcher_add_new_api(en_vocab):
-    doc = Doc(en_vocab, words=["a", "b"])
-    patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("OLD_API", None, *patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("NEW_API", patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-
-
 def test_phrase_matcher_repeated_add(en_vocab):
     matcher = PhraseMatcher(en_vocab)
     # match ID only gets added once
@@ -468,6 +446,13 @@ def test_phrase_matcher_deprecated(en_vocab):
         assert "spaCy v3.0" in str(record.list[0].message)
 
 
+def test_phrase_matcher_non_doc(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.raises(ValueError):
+        matcher.add("TEST", [doc, "junk"])
+
+
 @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
 def test_phrase_matcher_sent_start(en_vocab, attr):
     _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 9b79207f82e..cdb7dda7094 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -619,7 +619,7 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._flush_cache()
         if not self.faster_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
-            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
+            self._special_matcher.add(string, [self._tokenize_affixes(string, False)])
 
     def _reload_special_cases(self):
         self._flush_cache()
diff --git a/website/docs/api/matcher.mdx b/website/docs/api/matcher.mdx
index c66579da814..66954b6c4fb 100644
--- a/website/docs/api/matcher.mdx
+++ b/website/docs/api/matcher.mdx
@@ -211,20 +211,6 @@ will be overwritten.
 > matches = matcher(doc)
 > ```
 
-<Infobox title="Changed in v3.0" variant="warning">
-
-As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
-(instead of a variable number of arguments). The `on_match` callback becomes an
-optional keyword argument.
-
-```diff
-patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-- matcher.add("GoogleNow", on_match, *patterns)
-+ matcher.add("GoogleNow", patterns, on_match=on_match)
-```
-
-</Infobox>
-
 | Name                                | Description                                                                                                                                                |
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `match_id`                          | An ID for the thing you're matching. ~~str~~                                                                                                               |
diff --git a/website/docs/api/phrasematcher.mdx b/website/docs/api/phrasematcher.mdx
index 14ccefb772e..2c5e767dcba 100644
--- a/website/docs/api/phrasematcher.mdx
+++ b/website/docs/api/phrasematcher.mdx
@@ -116,10 +116,10 @@ Check whether the matcher contains rules for a match ID.
 ## PhraseMatcher.add {id="add",tag="method"}
 
 Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
 
 > #### Example
 >
@@ -134,20 +134,6 @@ overwritten.
 >   matches = matcher(doc)
 > ```
 
-<Infobox title="Changed in v3.0" variant="warning">
-
-As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
-argument (instead of a variable number of arguments). The `on_match` callback
-becomes an optional keyword argument.
-
-```diff
-patterns = [nlp("health care reform"), nlp("healthcare reform")]
-- matcher.add("HEALTH", on_match, *patterns)
-+ matcher.add("HEALTH", patterns, on_match=on_match)
-```
-
-</Infobox>
-
 | Name           | Description                                                                                                                                                |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `key`          | An ID for the thing you're matching. ~~str~~                                                                                                               |

From 6e6c5a7c716ea63aba62c165702acc50dda34ed4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Sep 2022 09:08:40 +0200
Subject: [PATCH 014/504] Consolidate and freeze symbols (#11352)

* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
---
 spacy/attrs.pxd             | 129 +++-------
 spacy/attrs.pyx             |  49 +---
 spacy/parts_of_speech.pxd   |  38 +--
 spacy/schemas.py            |   2 +-
 spacy/strings.pyx           |   4 +-
 spacy/symbols.pxd           |  15 +-
 spacy/symbols.pyx           |   6 +-
 spacy/tests/test_symbols.py | 467 ++++++++++++++++++++++++++++++++++++
 spacy/tokens/doc.pyx        |  20 +-
 9 files changed, 551 insertions(+), 179 deletions(-)
 create mode 100644 spacy/tests/test_symbols.py

diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index fbbac0ec29c..b8972cb714e 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,99 +1,50 @@
-# Reserve 64 values for flag features
 from . cimport symbols
 
 
 cdef enum attr_id_t:
-    NULL_ATTR
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-    LIKE_URL
-    LIKE_NUM
-    LIKE_EMAIL
-    IS_STOP
-    IS_OOV_DEPRECATED
-    IS_BRACKET
-    IS_QUOTE
-    IS_LEFT_PUNCT
-    IS_RIGHT_PUNCT
-    IS_CURRENCY
+    NULL_ATTR = 0
+    IS_ALPHA = symbols.IS_ALPHA
+    IS_ASCII = symbols.IS_ASCII
+    IS_DIGIT = symbols.IS_DIGIT
+    IS_LOWER = symbols.IS_LOWER
+    IS_PUNCT = symbols.IS_PUNCT
+    IS_SPACE = symbols.IS_SPACE
+    IS_TITLE = symbols.IS_TITLE
+    IS_UPPER = symbols.IS_UPPER
+    LIKE_URL = symbols.LIKE_URL
+    LIKE_NUM = symbols.LIKE_NUM
+    LIKE_EMAIL = symbols.LIKE_EMAIL
+    IS_STOP = symbols.IS_STOP
+    IS_BRACKET = symbols.IS_BRACKET
+    IS_QUOTE = symbols.IS_QUOTE
+    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
+    IS_CURRENCY = symbols.IS_CURRENCY
 
-    FLAG19 = 19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
+    ID = symbols.ID
+    ORTH = symbols.ORTH
+    LOWER = symbols.LOWER
+    NORM = symbols.NORM
+    SHAPE = symbols.SHAPE
+    PREFIX = symbols.PREFIX
+    SUFFIX = symbols.SUFFIX
 
-    ID
-    ORTH
-    LOWER
-    NORM
-    SHAPE
-    PREFIX
-    SUFFIX
+    LENGTH = symbols.LENGTH
+    CLUSTER = symbols.CLUSTER
+    LEMMA = symbols.LEMMA
+    POS = symbols.POS
+    TAG = symbols.TAG
+    DEP = symbols.DEP
+    ENT_IOB = symbols.ENT_IOB
+    ENT_TYPE = symbols.ENT_TYPE
+    HEAD = symbols.HEAD
+    SENT_START = symbols.SENT_START
+    SPACY = symbols.SPACY
+    PROB = symbols.PROB
 
-    LENGTH
-    CLUSTER
-    LEMMA
-    POS
-    TAG
-    DEP
-    ENT_IOB
-    ENT_TYPE
-    HEAD
-    SENT_START
-    SPACY
-    PROB
-
-    LANG
+    LANG = symbols.LANG
     ENT_KB_ID = symbols.ENT_KB_ID
-    MORPH
+    MORPH = symbols.MORPH
     ENT_ID = symbols.ENT_ID
 
-    IDX
-    SENT_END
+    IDX = symbols.IDX
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 0a4aecc5d85..1688afe47af 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -17,57 +17,11 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
     "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "IS_CURRENCY": IS_CURRENCY,
-    "FLAG19": FLAG19,
-    "FLAG20": FLAG20,
-    "FLAG21": FLAG21,
-    "FLAG22": FLAG22,
-    "FLAG23": FLAG23,
-    "FLAG24": FLAG24,
-    "FLAG25": FLAG25,
-    "FLAG26": FLAG26,
-    "FLAG27": FLAG27,
-    "FLAG28": FLAG28,
-    "FLAG29": FLAG29,
-    "FLAG30": FLAG30,
-    "FLAG31": FLAG31,
-    "FLAG32": FLAG32,
-    "FLAG33": FLAG33,
-    "FLAG34": FLAG34,
-    "FLAG35": FLAG35,
-    "FLAG36": FLAG36,
-    "FLAG37": FLAG37,
-    "FLAG38": FLAG38,
-    "FLAG39": FLAG39,
-    "FLAG40": FLAG40,
-    "FLAG41": FLAG41,
-    "FLAG42": FLAG42,
-    "FLAG43": FLAG43,
-    "FLAG44": FLAG44,
-    "FLAG45": FLAG45,
-    "FLAG46": FLAG46,
-    "FLAG47": FLAG47,
-    "FLAG48": FLAG48,
-    "FLAG49": FLAG49,
-    "FLAG50": FLAG50,
-    "FLAG51": FLAG51,
-    "FLAG52": FLAG52,
-    "FLAG53": FLAG53,
-    "FLAG54": FLAG54,
-    "FLAG55": FLAG55,
-    "FLAG56": FLAG56,
-    "FLAG57": FLAG57,
-    "FLAG58": FLAG58,
-    "FLAG59": FLAG59,
-    "FLAG60": FLAG60,
-    "FLAG61": FLAG61,
-    "FLAG62": FLAG62,
-    "FLAG63": FLAG63,
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,
@@ -93,8 +47,7 @@ IDS = {
 }
 
 
-# ATTR IDs, in order of the symbol
-NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+NAMES = {v: k for k, v in IDS.items()}
 locals().update(IDS)
 
 
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index b5423d11301..01f116ea688 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -4,22 +4,22 @@ from . cimport symbols
 cpdef enum univ_pos_t:
     NO_TAG = 0
     ADJ = symbols.ADJ
-    ADP
-    ADV
-    AUX
-    CONJ
-    CCONJ  # U20
-    DET
-    INTJ
-    NOUN
-    NUM
-    PART
-    PRON
-    PROPN
-    PUNCT
-    SCONJ
-    SYM
-    VERB
-    X
-    EOL
-    SPACE
+    ADP = symbols.ADP
+    ADV = symbols.ADV
+    AUX = symbols.AUX
+    CONJ = symbols.CONJ
+    CCONJ  = symbols.CCONJ  # U20
+    DET = symbols.DET
+    INTJ = symbols.INTJ
+    NOUN = symbols.NOUN
+    NUM = symbols.NUM
+    PART = symbols.PART
+    PRON = symbols.PRON
+    PROPN = symbols.PROPN
+    PUNCT = symbols.PUNCT
+    SCONJ = symbols.SCONJ
+    SYM = symbols.SYM
+    VERB = symbols.VERB
+    X = symbols.X
+    EOL = symbols.EOL
+    SPACE = symbols.SPACE
diff --git a/spacy/schemas.py b/spacy/schemas.py
index fa987b90f19..9a2b5ed60e9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -181,7 +181,7 @@ def validate_init_settings(
 
 def validate_token_pattern(obj: list) -> List[str]:
     # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
-    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
+    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
     if isinstance(obj, list):
         converted = []
         for pattern in obj:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 376a131751e..e73b66dff54 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -148,7 +148,7 @@ cdef class StringStore:
         elif _try_coerce_to_hash(string_or_id, &str_hash):
             if str_hash == 0:
                 return ""
-            elif str_hash < len(SYMBOLS_BY_INT):
+            elif str_hash in SYMBOLS_BY_INT:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
@@ -224,7 +224,7 @@ cdef class StringStore:
             # TODO: Raise an error instead
             return self._map.get(string_or_id) is not NULL
 
-        if str_hash < len(SYMBOLS_BY_INT):
+        if str_hash in SYMBOLS_BY_INT:
             return True
         else:
             return self._map.get(str_hash) is not NULL
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 73be19145b2..9e74bf67620 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -1,5 +1,6 @@
+# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0.
 cdef enum symbol_t:
-    NIL
+    NIL = 0
     IS_ALPHA
     IS_ASCII
     IS_DIGIT
@@ -65,7 +66,7 @@ cdef enum symbol_t:
     FLAG62
     FLAG63
 
-    ID
+    ID = 64
     ORTH
     LOWER
     NORM
@@ -385,7 +386,7 @@ cdef enum symbol_t:
     DEPRECATED275
     DEPRECATED276
 
-    PERSON
+    PERSON = 380
     NORP
     FACILITY
     ORG
@@ -405,7 +406,7 @@ cdef enum symbol_t:
     ORDINAL
     CARDINAL
 
-    acomp
+    acomp = 398
     advcl
     advmod
     agent
@@ -458,12 +459,12 @@ cdef enum symbol_t:
     rcmod
     root
     xcomp
-
     acl
 
-    ENT_KB_ID
+    ENT_KB_ID = 452
     MORPH
     ENT_ID
 
     IDX
-    _
+    _ = 456
+    # DO NOT ADD ANY NEW SYMBOLS!
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index f7713577bd3..d2a8a428954 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -470,11 +470,7 @@ IDS = {
 }
 
 
-def sort_nums(x):
-    return x[1]
-
-
-NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
+NAMES = {v: k for k, v in IDS.items()}
 # Unfortunate hack here, to work around problem with long cpdef enum
 # (which is generating an enormous amount of C++ in Cython 0.24+)
 # We keep the enum cdef, and just make sure the names are available to Python
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
new file mode 100644
index 00000000000..fb034accac2
--- /dev/null
+++ b/spacy/tests/test_symbols.py
@@ -0,0 +1,467 @@
+import pytest
+from spacy.symbols import IDS, NAMES
+
+V3_SYMBOLS = {
+    "": 0,
+    "IS_ALPHA": 1,
+    "IS_ASCII": 2,
+    "IS_DIGIT": 3,
+    "IS_LOWER": 4,
+    "IS_PUNCT": 5,
+    "IS_SPACE": 6,
+    "IS_TITLE": 7,
+    "IS_UPPER": 8,
+    "LIKE_URL": 9,
+    "LIKE_NUM": 10,
+    "LIKE_EMAIL": 11,
+    "IS_STOP": 12,
+    "IS_OOV_DEPRECATED": 13,
+    "IS_BRACKET": 14,
+    "IS_QUOTE": 15,
+    "IS_LEFT_PUNCT": 16,
+    "IS_RIGHT_PUNCT": 17,
+    "IS_CURRENCY": 18,
+    "FLAG19": 19,
+    "FLAG20": 20,
+    "FLAG21": 21,
+    "FLAG22": 22,
+    "FLAG23": 23,
+    "FLAG24": 24,
+    "FLAG25": 25,
+    "FLAG26": 26,
+    "FLAG27": 27,
+    "FLAG28": 28,
+    "FLAG29": 29,
+    "FLAG30": 30,
+    "FLAG31": 31,
+    "FLAG32": 32,
+    "FLAG33": 33,
+    "FLAG34": 34,
+    "FLAG35": 35,
+    "FLAG36": 36,
+    "FLAG37": 37,
+    "FLAG38": 38,
+    "FLAG39": 39,
+    "FLAG40": 40,
+    "FLAG41": 41,
+    "FLAG42": 42,
+    "FLAG43": 43,
+    "FLAG44": 44,
+    "FLAG45": 45,
+    "FLAG46": 46,
+    "FLAG47": 47,
+    "FLAG48": 48,
+    "FLAG49": 49,
+    "FLAG50": 50,
+    "FLAG51": 51,
+    "FLAG52": 52,
+    "FLAG53": 53,
+    "FLAG54": 54,
+    "FLAG55": 55,
+    "FLAG56": 56,
+    "FLAG57": 57,
+    "FLAG58": 58,
+    "FLAG59": 59,
+    "FLAG60": 60,
+    "FLAG61": 61,
+    "FLAG62": 62,
+    "FLAG63": 63,
+    "ID": 64,
+    "ORTH": 65,
+    "LOWER": 66,
+    "NORM": 67,
+    "SHAPE": 68,
+    "PREFIX": 69,
+    "SUFFIX": 70,
+    "LENGTH": 71,
+    "CLUSTER": 72,
+    "LEMMA": 73,
+    "POS": 74,
+    "TAG": 75,
+    "DEP": 76,
+    "ENT_IOB": 77,
+    "ENT_TYPE": 78,
+    "ENT_ID": 454,
+    "ENT_KB_ID": 452,
+    "HEAD": 79,
+    "SENT_START": 80,
+    "SPACY": 81,
+    "PROB": 82,
+    "LANG": 83,
+    "IDX": 455,
+    "ADJ": 84,
+    "ADP": 85,
+    "ADV": 86,
+    "AUX": 87,
+    "CONJ": 88,
+    "CCONJ": 89,
+    "DET": 90,
+    "INTJ": 91,
+    "NOUN": 92,
+    "NUM": 93,
+    "PART": 94,
+    "PRON": 95,
+    "PROPN": 96,
+    "PUNCT": 97,
+    "SCONJ": 98,
+    "SYM": 99,
+    "VERB": 100,
+    "X": 101,
+    "EOL": 102,
+    "SPACE": 103,
+    "DEPRECATED001": 104,
+    "DEPRECATED002": 105,
+    "DEPRECATED003": 106,
+    "DEPRECATED004": 107,
+    "DEPRECATED005": 108,
+    "DEPRECATED006": 109,
+    "DEPRECATED007": 110,
+    "DEPRECATED008": 111,
+    "DEPRECATED009": 112,
+    "DEPRECATED010": 113,
+    "DEPRECATED011": 114,
+    "DEPRECATED012": 115,
+    "DEPRECATED013": 116,
+    "DEPRECATED014": 117,
+    "DEPRECATED015": 118,
+    "DEPRECATED016": 119,
+    "DEPRECATED017": 120,
+    "DEPRECATED018": 121,
+    "DEPRECATED019": 122,
+    "DEPRECATED020": 123,
+    "DEPRECATED021": 124,
+    "DEPRECATED022": 125,
+    "DEPRECATED023": 126,
+    "DEPRECATED024": 127,
+    "DEPRECATED025": 128,
+    "DEPRECATED026": 129,
+    "DEPRECATED027": 130,
+    "DEPRECATED028": 131,
+    "DEPRECATED029": 132,
+    "DEPRECATED030": 133,
+    "DEPRECATED031": 134,
+    "DEPRECATED032": 135,
+    "DEPRECATED033": 136,
+    "DEPRECATED034": 137,
+    "DEPRECATED035": 138,
+    "DEPRECATED036": 139,
+    "DEPRECATED037": 140,
+    "DEPRECATED038": 141,
+    "DEPRECATED039": 142,
+    "DEPRECATED040": 143,
+    "DEPRECATED041": 144,
+    "DEPRECATED042": 145,
+    "DEPRECATED043": 146,
+    "DEPRECATED044": 147,
+    "DEPRECATED045": 148,
+    "DEPRECATED046": 149,
+    "DEPRECATED047": 150,
+    "DEPRECATED048": 151,
+    "DEPRECATED049": 152,
+    "DEPRECATED050": 153,
+    "DEPRECATED051": 154,
+    "DEPRECATED052": 155,
+    "DEPRECATED053": 156,
+    "DEPRECATED054": 157,
+    "DEPRECATED055": 158,
+    "DEPRECATED056": 159,
+    "DEPRECATED057": 160,
+    "DEPRECATED058": 161,
+    "DEPRECATED059": 162,
+    "DEPRECATED060": 163,
+    "DEPRECATED061": 164,
+    "DEPRECATED062": 165,
+    "DEPRECATED063": 166,
+    "DEPRECATED064": 167,
+    "DEPRECATED065": 168,
+    "DEPRECATED066": 169,
+    "DEPRECATED067": 170,
+    "DEPRECATED068": 171,
+    "DEPRECATED069": 172,
+    "DEPRECATED070": 173,
+    "DEPRECATED071": 174,
+    "DEPRECATED072": 175,
+    "DEPRECATED073": 176,
+    "DEPRECATED074": 177,
+    "DEPRECATED075": 178,
+    "DEPRECATED076": 179,
+    "DEPRECATED077": 180,
+    "DEPRECATED078": 181,
+    "DEPRECATED079": 182,
+    "DEPRECATED080": 183,
+    "DEPRECATED081": 184,
+    "DEPRECATED082": 185,
+    "DEPRECATED083": 186,
+    "DEPRECATED084": 187,
+    "DEPRECATED085": 188,
+    "DEPRECATED086": 189,
+    "DEPRECATED087": 190,
+    "DEPRECATED088": 191,
+    "DEPRECATED089": 192,
+    "DEPRECATED090": 193,
+    "DEPRECATED091": 194,
+    "DEPRECATED092": 195,
+    "DEPRECATED093": 196,
+    "DEPRECATED094": 197,
+    "DEPRECATED095": 198,
+    "DEPRECATED096": 199,
+    "DEPRECATED097": 200,
+    "DEPRECATED098": 201,
+    "DEPRECATED099": 202,
+    "DEPRECATED100": 203,
+    "DEPRECATED101": 204,
+    "DEPRECATED102": 205,
+    "DEPRECATED103": 206,
+    "DEPRECATED104": 207,
+    "DEPRECATED105": 208,
+    "DEPRECATED106": 209,
+    "DEPRECATED107": 210,
+    "DEPRECATED108": 211,
+    "DEPRECATED109": 212,
+    "DEPRECATED110": 213,
+    "DEPRECATED111": 214,
+    "DEPRECATED112": 215,
+    "DEPRECATED113": 216,
+    "DEPRECATED114": 217,
+    "DEPRECATED115": 218,
+    "DEPRECATED116": 219,
+    "DEPRECATED117": 220,
+    "DEPRECATED118": 221,
+    "DEPRECATED119": 222,
+    "DEPRECATED120": 223,
+    "DEPRECATED121": 224,
+    "DEPRECATED122": 225,
+    "DEPRECATED123": 226,
+    "DEPRECATED124": 227,
+    "DEPRECATED125": 228,
+    "DEPRECATED126": 229,
+    "DEPRECATED127": 230,
+    "DEPRECATED128": 231,
+    "DEPRECATED129": 232,
+    "DEPRECATED130": 233,
+    "DEPRECATED131": 234,
+    "DEPRECATED132": 235,
+    "DEPRECATED133": 236,
+    "DEPRECATED134": 237,
+    "DEPRECATED135": 238,
+    "DEPRECATED136": 239,
+    "DEPRECATED137": 240,
+    "DEPRECATED138": 241,
+    "DEPRECATED139": 242,
+    "DEPRECATED140": 243,
+    "DEPRECATED141": 244,
+    "DEPRECATED142": 245,
+    "DEPRECATED143": 246,
+    "DEPRECATED144": 247,
+    "DEPRECATED145": 248,
+    "DEPRECATED146": 249,
+    "DEPRECATED147": 250,
+    "DEPRECATED148": 251,
+    "DEPRECATED149": 252,
+    "DEPRECATED150": 253,
+    "DEPRECATED151": 254,
+    "DEPRECATED152": 255,
+    "DEPRECATED153": 256,
+    "DEPRECATED154": 257,
+    "DEPRECATED155": 258,
+    "DEPRECATED156": 259,
+    "DEPRECATED157": 260,
+    "DEPRECATED158": 261,
+    "DEPRECATED159": 262,
+    "DEPRECATED160": 263,
+    "DEPRECATED161": 264,
+    "DEPRECATED162": 265,
+    "DEPRECATED163": 266,
+    "DEPRECATED164": 267,
+    "DEPRECATED165": 268,
+    "DEPRECATED166": 269,
+    "DEPRECATED167": 270,
+    "DEPRECATED168": 271,
+    "DEPRECATED169": 272,
+    "DEPRECATED170": 273,
+    "DEPRECATED171": 274,
+    "DEPRECATED172": 275,
+    "DEPRECATED173": 276,
+    "DEPRECATED174": 277,
+    "DEPRECATED175": 278,
+    "DEPRECATED176": 279,
+    "DEPRECATED177": 280,
+    "DEPRECATED178": 281,
+    "DEPRECATED179": 282,
+    "DEPRECATED180": 283,
+    "DEPRECATED181": 284,
+    "DEPRECATED182": 285,
+    "DEPRECATED183": 286,
+    "DEPRECATED184": 287,
+    "DEPRECATED185": 288,
+    "DEPRECATED186": 289,
+    "DEPRECATED187": 290,
+    "DEPRECATED188": 291,
+    "DEPRECATED189": 292,
+    "DEPRECATED190": 293,
+    "DEPRECATED191": 294,
+    "DEPRECATED192": 295,
+    "DEPRECATED193": 296,
+    "DEPRECATED194": 297,
+    "DEPRECATED195": 298,
+    "DEPRECATED196": 299,
+    "DEPRECATED197": 300,
+    "DEPRECATED198": 301,
+    "DEPRECATED199": 302,
+    "DEPRECATED200": 303,
+    "DEPRECATED201": 304,
+    "DEPRECATED202": 305,
+    "DEPRECATED203": 306,
+    "DEPRECATED204": 307,
+    "DEPRECATED205": 308,
+    "DEPRECATED206": 309,
+    "DEPRECATED207": 310,
+    "DEPRECATED208": 311,
+    "DEPRECATED209": 312,
+    "DEPRECATED210": 313,
+    "DEPRECATED211": 314,
+    "DEPRECATED212": 315,
+    "DEPRECATED213": 316,
+    "DEPRECATED214": 317,
+    "DEPRECATED215": 318,
+    "DEPRECATED216": 319,
+    "DEPRECATED217": 320,
+    "DEPRECATED218": 321,
+    "DEPRECATED219": 322,
+    "DEPRECATED220": 323,
+    "DEPRECATED221": 324,
+    "DEPRECATED222": 325,
+    "DEPRECATED223": 326,
+    "DEPRECATED224": 327,
+    "DEPRECATED225": 328,
+    "DEPRECATED226": 329,
+    "DEPRECATED227": 330,
+    "DEPRECATED228": 331,
+    "DEPRECATED229": 332,
+    "DEPRECATED230": 333,
+    "DEPRECATED231": 334,
+    "DEPRECATED232": 335,
+    "DEPRECATED233": 336,
+    "DEPRECATED234": 337,
+    "DEPRECATED235": 338,
+    "DEPRECATED236": 339,
+    "DEPRECATED237": 340,
+    "DEPRECATED238": 341,
+    "DEPRECATED239": 342,
+    "DEPRECATED240": 343,
+    "DEPRECATED241": 344,
+    "DEPRECATED242": 345,
+    "DEPRECATED243": 346,
+    "DEPRECATED244": 347,
+    "DEPRECATED245": 348,
+    "DEPRECATED246": 349,
+    "DEPRECATED247": 350,
+    "DEPRECATED248": 351,
+    "DEPRECATED249": 352,
+    "DEPRECATED250": 353,
+    "DEPRECATED251": 354,
+    "DEPRECATED252": 355,
+    "DEPRECATED253": 356,
+    "DEPRECATED254": 357,
+    "DEPRECATED255": 358,
+    "DEPRECATED256": 359,
+    "DEPRECATED257": 360,
+    "DEPRECATED258": 361,
+    "DEPRECATED259": 362,
+    "DEPRECATED260": 363,
+    "DEPRECATED261": 364,
+    "DEPRECATED262": 365,
+    "DEPRECATED263": 366,
+    "DEPRECATED264": 367,
+    "DEPRECATED265": 368,
+    "DEPRECATED266": 369,
+    "DEPRECATED267": 370,
+    "DEPRECATED268": 371,
+    "DEPRECATED269": 372,
+    "DEPRECATED270": 373,
+    "DEPRECATED271": 374,
+    "DEPRECATED272": 375,
+    "DEPRECATED273": 376,
+    "DEPRECATED274": 377,
+    "DEPRECATED275": 378,
+    "DEPRECATED276": 379,
+    "PERSON": 380,
+    "NORP": 381,
+    "FACILITY": 382,
+    "ORG": 383,
+    "GPE": 384,
+    "LOC": 385,
+    "PRODUCT": 386,
+    "EVENT": 387,
+    "WORK_OF_ART": 388,
+    "LANGUAGE": 389,
+    "DATE": 391,
+    "TIME": 392,
+    "PERCENT": 393,
+    "MONEY": 394,
+    "QUANTITY": 395,
+    "ORDINAL": 396,
+    "CARDINAL": 397,
+    "acomp": 398,
+    "advcl": 399,
+    "advmod": 400,
+    "agent": 401,
+    "amod": 402,
+    "appos": 403,
+    "attr": 404,
+    "aux": 405,
+    "auxpass": 406,
+    "cc": 407,
+    "ccomp": 408,
+    "complm": 409,
+    "conj": 410,
+    "cop": 411,
+    "csubj": 412,
+    "csubjpass": 413,
+    "dep": 414,
+    "det": 415,
+    "dobj": 416,
+    "expl": 417,
+    "hmod": 418,
+    "hyph": 419,
+    "infmod": 420,
+    "intj": 421,
+    "iobj": 422,
+    "mark": 423,
+    "meta": 424,
+    "neg": 425,
+    "nmod": 426,
+    "nn": 427,
+    "npadvmod": 428,
+    "nsubj": 429,
+    "nsubjpass": 430,
+    "num": 431,
+    "number": 432,
+    "oprd": 433,
+    "obj": 434,
+    "obl": 435,
+    "parataxis": 436,
+    "partmod": 437,
+    "pcomp": 438,
+    "pobj": 439,
+    "poss": 440,
+    "possessive": 441,
+    "preconj": 442,
+    "prep": 443,
+    "prt": 444,
+    "punct": 445,
+    "quantmod": 446,
+    "rcmod": 448,
+    "relcl": 447,
+    "root": 449,
+    "xcomp": 450,
+    "acl": 451,
+    "LAW": 390,
+    "MORPH": 453,
+    "_": 456,
+}
+
+
+def test_frozen_symbols():
+    assert IDS == V3_SYMBOLS
+    assert NAMES == {v: k for k, v in IDS.items()}
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index cee2eda6c53..8db8c1d6f37 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1022,22 +1022,26 @@ cdef class Doc:
                 for id_ in py_attr_ids
             ]
         except KeyError as msg:
-            keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
+            keys = list(IDS.keys())
             raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
         # Make an array from the attributes --- otherwise our inner loop is
         # Python dict iteration.
-        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
-        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
+        cdef Pool mem = Pool()
+        cdef int n_attrs = len(py_attr_ids)
+        cdef attr_id_t* c_attr_ids
+        if n_attrs > 0:
+            c_attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
+            for i, attr_id in enumerate(py_attr_ids):
+                c_attr_ids[i] = attr_id
+        output = numpy.ndarray(shape=(self.length, n_attrs), dtype=numpy.uint64)
         c_output = <attr_t*>output.data
-        c_attr_ids = <attr_id_t*>attr_ids.data
         cdef TokenC* token
-        cdef int nr_attr = attr_ids.shape[0]
         for i in range(self.length):
             token = &self.c[i]
-            for j in range(nr_attr):
-                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
+            for j in range(n_attrs):
+                c_output[i*n_attrs + j] = get_token_attr(token, c_attr_ids[j])
         # Handle 1d case
-        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
+        return output if n_attrs >= 2 else output.reshape((self.length,))
 
     def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
         """Count the frequencies of a given attribute. Produces a dict of

From bc106789618fc5e8eeeffcb5d445d380f3ee08df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH 015/504] Store activations in `Doc`s when `save_activations`
 is enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/edit_tree_lemmatizer.py        |  46 ++--
 spacy/pipeline/entity_linker.py               | 244 +++++++++++-------
 spacy/pipeline/morphologizer.pyx              |  37 ++-
 spacy/pipeline/senter.pyx                     |  38 ++-
 spacy/pipeline/spancat.py                     |  84 +++---
 spacy/pipeline/tagger.pyx                     |  43 ++-
 spacy/pipeline/textcat.py                     |  37 ++-
 spacy/pipeline/textcat_multilabel.py          |  23 +-
 spacy/pipeline/trainable_pipe.pxd             |   1 +
 spacy/pipeline/trainable_pipe.pyx             |  14 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |  26 ++
 spacy/tests/pipeline/test_entity_linker.py    |  78 ++++--
 spacy/tests/pipeline/test_morphologizer.py    |  26 +-
 spacy/tests/pipeline/test_senter.py           |  25 ++
 spacy/tests/pipeline/test_spancat.py          |  34 +--
 spacy/tests/pipeline/test_tagger.py           |  24 +-
 spacy/tests/pipeline/test_textcat.py          |  64 +++--
 spacy/tokens/doc.pxd                          |   2 +
 spacy/tokens/doc.pyi                          |   3 +-
 spacy/tokens/doc.pyx                          |   1 +
 website/docs/api/doc.mdx                      |  33 +--
 website/docs/api/edittreelemmatizer.mdx       |  17 +-
 website/docs/api/entitylinker.mdx             |  29 +--
 website/docs/api/morphologizer.mdx            |  18 +-
 website/docs/api/sentencerecognizer.mdx       |  11 +-
 website/docs/api/spancategorizer.mdx          |  35 +--
 website/docs/api/tagger.mdx                   |  14 +-
 website/docs/api/textcategorizer.mdx          |  17 +-
 28 files changed, 669 insertions(+), 355 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 4a6174bc3d8..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,8 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -22,6 +22,9 @@
 TOP_K_GUARDRAIL = 20
 
 
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -50,6 +53,7 @@
         "overwrite": False,
         "top_k": 1,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"lemma_acc": 1.0},
 )
@@ -62,6 +66,7 @@ def make_edit_tree_lemmatizer(
     overwrite: bool,
     top_k: int,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
     """Construct an EditTreeLemmatizer component."""
     return EditTreeLemmatizer(
@@ -73,6 +78,7 @@ def make_edit_tree_lemmatizer(
         overwrite=overwrite,
         top_k=top_k,
         scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -92,6 +98,7 @@ def __init__(
         overwrite: bool = False,
         top_k: int = 1,
         scorer: Optional[Callable] = lemmatizer_score,
+        save_activations: bool = False,
     ):
         """
         Construct an edit tree lemmatizer.
@@ -103,6 +110,7 @@ def __init__(
             frequency in the training data.
         overwrite (bool): overwrite existing lemma annotations.
         top_k (int): try to apply at most the k most probable edit trees.
+        save_activations (bool): save model activations in Doc when annotating.
         """
         self.vocab = vocab
         self.model = model
@@ -117,7 +125,7 @@ def __init__(
 
         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
-        self.numpy_ops = NumpyOps()
+        self.save_activations = save_activations
 
     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
@@ -146,31 +154,24 @@ def get_loss(
 
         return float(loss), d_scores
 
-    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
-        if self.top_k == 1:
-            scores2guesses = self._scores2guesses_top_k_equals_1
-        elif self.top_k <= TOP_K_GUARDRAIL:
-            scores2guesses = self._scores2guesses_top_k_greater_1
-        else:
-            scores2guesses = self._scores2guesses_top_k_guardrail
-        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
-        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
-        # for its principal purpose of lemmatizing tokens. However, the code could also
-        # be used for other purposes, and with very large values of *top_k* the method
-        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
-        # instead.
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
+            guesses: List[Ints1d] = [
+                self.model.ops.alloc((0,), dtype="i") for doc in docs
+            ]
+            scores: List[Floats2d] = [
+                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
+            ]
             assert len(guesses) == n_docs
-            return guesses
+            return {"probabilities": scores, "tree_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
         guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
-        return guesses
+        return {"probabilities": scores, "tree_ids": guesses}
 
     def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
@@ -230,8 +231,13 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
 
         return guesses
 
-    def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
+        batch_tree_ids = activations["tree_ids"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tree_ids = batch_tree_ids[i]
             if hasattr(doc_tree_ids, "get"):
                 doc_tree_ids = doc_tree_ids.get()
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index a730ece1bfa..bab79282d5b 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,10 @@
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
+from typing import cast
+from numpy import dtype
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from pathlib import Path
+from itertools import islice
+import srsly
 import random
 from itertools import islice
 from pathlib import Path
@@ -21,6 +28,11 @@
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
+
+ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
+
+KNOWLEDGE_BASE_IDS = "kb_ids"
+
 # See #9050
 BACKWARD_OVERWRITE = True
 
@@ -60,6 +72,7 @@
         "use_gold_ents": True,
         "candidates_batch_size": 1,
         "threshold": None,
+        "save_activations": False,
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -87,6 +100,7 @@ def make_entity_linker(
     use_gold_ents: bool,
     candidates_batch_size: int,
     threshold: Optional[float] = None,
+    save_activations: bool,
 ):
     """Construct an EntityLinker component.
 
@@ -110,6 +124,7 @@ def make_entity_linker(
     candidates_batch_size (int): Size of batches for entity candidate generation.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.
+    save_activations (bool): save model activations in Doc when annotating.
     """
 
     if not model.attrs.get("include_span_maker", False):
@@ -144,6 +159,7 @@ def make_entity_linker(
         use_gold_ents=use_gold_ents,
         candidates_batch_size=candidates_batch_size,
         threshold=threshold,
+        save_activations=save_activations,
     )
 
 
@@ -185,6 +201,7 @@ def __init__(
         use_gold_ents: bool,
         candidates_batch_size: int,
         threshold: Optional[float] = None,
+        save_activations: bool = False,
     ) -> None:
         """Initialize an entity linker.
 
@@ -239,6 +256,7 @@ def __init__(
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold
+        self.save_activations = save_activations
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
@@ -427,7 +445,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         loss = loss / len(entity_encodings)
         return float(loss), out
 
-    def predict(self, docs: Iterable[Doc]) -> List[str]:
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
         Returns the KB IDs for each entity in each doc, including NIL if there is
         no prediction.
@@ -440,129 +458,138 @@ def predict(self, docs: Iterable[Doc]) -> List[str]:
         self.validate_kb()
         entity_count = 0
         final_kb_ids: List[str] = []
-        xp = self.model.ops.xp
+        ops = self.model.ops
+        xp = ops.xp
+        docs_ents: List[Ragged] = []
+        docs_scores: List[Ragged] = []
         if not docs:
-            return final_kb_ids
+            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
         if isinstance(docs, Doc):
             docs = [docs]
-        for i, doc in enumerate(docs):
+        for doc in docs:
+            doc_ents: List[Ints1d] = []
+            doc_scores: List[Floats1d] = []
             if len(doc) == 0:
+                docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
+                docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 continue
             sentences = [s for s in doc.sents]
 
-            # Loop over entities in batches.
-            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
-                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
-                # Look up candidate entities.
-                valid_ent_idx = [
-                    idx
-                    for idx in range(len(ent_batch))
-                    if ent_batch[idx].label_ not in self.labels_discard
-                ]
-
-                batch_candidates = list(
-                    self.get_candidates_batch(
-                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
-                    )
-                    if self.candidates_batch_size > 1
-                    else [
-                        self.get_candidates(self.kb, ent_batch[idx])
-                        for idx in valid_ent_idx
-                    ]
-                )
-
-                # Looping through each entity in batch (TODO: rewrite)
-                for j, ent in enumerate(ent_batch):
-                    assert hasattr(ent, "sents")
-                    sents = list(ent.sents)
-                    sent_indices = (
-                        sentences.index(sents[0]),
-                        sentences.index(sents[-1]),
+                if self.incl_context:
+                    # get n_neighbour sentences, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    sentence_encoding = self.model.predict([sent_doc])[0]
+                    sentence_encoding_t = sentence_encoding.T
+                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                entity_count += 1
+                if ent.label_ in self.labels_discard:
+                    # ignoring this entity - setting to NIL
+                    final_kb_ids.append(self.NIL)
+                    self._add_activations(
+                        doc_scores=doc_scores,
+                        doc_ents=doc_ents,
+                        scores=[0.0],
+                        ents=[0],
                     )
-                    assert sent_indices[1] >= sent_indices[0] >= 0
-
-                    if self.incl_context:
-                        # get n_neighbour sentences, clipped to the length of the document
-                        start_sentence = max(0, sent_indices[0] - self.n_sents)
-                        end_sentence = min(
-                            len(sentences) - 1, sent_indices[1] + self.n_sents
-                        )
-                        start_token = sentences[start_sentence].start
-                        end_token = sentences[end_sentence].end
-                        sent_doc = doc[start_token:end_token].as_doc()
-
-                        # currently, the context is the same for each entity in a sentence (should be refined)
-                        sentence_encoding = self.model.predict([sent_doc])[0]
-                        sentence_encoding_t = sentence_encoding.T
-                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                    entity_count += 1
-                    if ent.label_ in self.labels_discard:
-                        # ignoring this entity - setting to NIL
+                else:
+                    candidates = list(self.get_candidates(self.kb, ent))
+                    if not candidates:
+                        # no prediction possible for this entity - setting to NIL
                         final_kb_ids.append(self.NIL)
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=[0.0],
+                            ents=[0],
+                        )
+                    elif len(candidates) == 1 and self.threshold is None:
+                        # shortcut for efficiency reasons: take the 1 candidate
+                        final_kb_ids.append(candidates[0].entity_)
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=[1.0],
+                            ents=[candidates[0].entity_],
+                        )
                     else:
-                        candidates = list(batch_candidates[j])
-                        if not candidates:
-                            # no prediction possible for this entity - setting to NIL
-                            final_kb_ids.append(self.NIL)
-                        elif len(candidates) == 1 and self.threshold is None:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
+                        random.shuffle(candidates)
+                        # set all prior probabilities to 0 if incl_prior=False
+                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
+                        if not self.incl_prior:
+                            prior_probs = xp.asarray([0.0 for _ in candidates])
+                        scores = prior_probs
+                        # add in similarity from the context
+                        if self.incl_context:
+                            entity_encodings = xp.asarray(
+                                [c.entity_vector for c in candidates]
+                            )
+                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                            if len(entity_encodings) != len(prior_probs):
+                                raise RuntimeError(
+                                    Errors.E147.format(
+                                        method="predict",
+                                        msg="vectors not of equal length",
                                     )
-                                # cosine similarity
-                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                    sentence_norm * entity_norm
                                 )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            final_kb_ids.append(
-                                candidates[scores.argmax().item()].entity_
-                                if self.threshold is None
-                                or scores.max() >= self.threshold
-                                else EntityLinker.NIL
+                            # cosine similarity
+                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+                                sentence_norm * entity_norm
                             )
-
+                            if sims.shape != prior_probs.shape:
+                                raise ValueError(Errors.E161)
+                            scores = prior_probs + sims - (prior_probs * sims)
+                        final_kb_ids.append(
+                            candidates[scores.argmax().item()].entity_
+                            if self.threshold is None or scores.max() >= self.threshold
+                            else EntityLinker.NIL
+                        )
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=scores,
+                            ents=[c.entity for c in candidates],
+                        )
+            self._add_doc_activations(
+                docs_scores=docs_scores,
+                docs_ents=docs_ents,
+                doc_scores=doc_scores,
+                doc_ents=doc_ents,
+            )
         if not (len(final_kb_ids) == entity_count):
             err = Errors.E147.format(
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return final_kb_ids
+        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
 
-    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced
+                                 by EntityLinker.predict.
 
         DOCS: https://spacy.io/api/entitylinker#set_annotations
         """
+        kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
         count_ents = len([ent for doc in docs for ent in doc.ents])
         if count_ents != len(kb_ids):
             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
         i = 0
         overwrite = self.cfg["overwrite"]
-        for doc in docs:
+        for j, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    if act_name != KNOWLEDGE_BASE_IDS:
+                        # We only copy activations that are Ragged.
+                        doc.activations[self.name][act_name] = cast(Ragged, acts[j])
+
             for ent in doc.ents:
                 kb_id = kb_ids[i]
                 i += 1
@@ -661,3 +688,32 @@ def rehearse(self, examples, *, sgd=None, losses=None, **config):
 
     def add_label(self, label):
         raise NotImplementedError
+
+    def _add_doc_activations(
+        self,
+        *,
+        docs_scores: List[Ragged],
+        docs_ents: List[Ragged],
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
+        docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
+        docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
+
+    def _add_activations(
+        self,
+        *,
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+        scores: Sequence[float],
+        ents: Sequence[int],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        doc_scores.append(ops.asarray1f(scores))
+        doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index bdbe75fd824..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,4 +1,8 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import srsly
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
 
@@ -8,6 +12,12 @@ from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
+from ..language import Language
+from ..errors import Errors
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -50,8 +60,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
-                    "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -62,8 +77,10 @@ def make_morphologizer(
     extend: bool,
     label_smoothing: float,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+                         save_activations=save_activations)
 
 
 def morphologizer_score(examples, **kwargs):
@@ -99,6 +116,7 @@ class Morphologizer(Tagger):
         extend: bool = BACKWARD_EXTEND,
         label_smoothing: float = 0.0,
         scorer: Optional[Callable] = morphologizer_score,
+        save_activations: bool = False,
     ):
         """Initialize a morphologizer.
 
@@ -109,6 +127,7 @@ class Morphologizer(Tagger):
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/morphologizer#init
         """
@@ -129,6 +148,7 @@ class Morphologizer(Tagger):
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -222,14 +242,15 @@ class Morphologizer(Tagger):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
 
         DOCS: https://spacy.io/api/morphologizer#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
@@ -240,6 +261,10 @@ class Morphologizer(Tagger):
         # to allocate a compatible container out of the iterable.
         labels = tuple(self.labels)
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index df093baa9c6..521afe1d181 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,12 +1,16 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
 from typing import Callable, Optional
 
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
@@ -37,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "senter",
     assigns=["token.is_sent_start"],
-    default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
+    default_config={
+        "model": DEFAULT_SENTER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
-def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
-    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
+def make_senter(nlp: Language,
+                name: str,
+                model: Model,
+                overwrite: bool,
+                scorer: Optional[Callable],
+                save_activations: bool):
+    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
 
 
 def senter_score(examples, **kwargs):
@@ -71,6 +85,7 @@ class SentenceRecognizer(Tagger):
         *,
         overwrite=BACKWARD_OVERWRITE,
         scorer=senter_score,
+        save_activations: bool = False,
     ):
         """Initialize a sentence recognizer.
 
@@ -80,6 +95,7 @@ class SentenceRecognizer(Tagger):
             losses during training.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/sentencerecognizer#init
         """
@@ -89,6 +105,7 @@ class SentenceRecognizer(Tagger):
         self._rehearsal_model = None
         self.cfg = {"overwrite": overwrite}
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -106,19 +123,24 @@ class SentenceRecognizer(Tagger):
     def label_data(self):
         return None
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
 
         DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 08a5478a912..1450bb5d6cb 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
@@ -16,6 +18,9 @@
 from ..vocab import Vocab
 from .trainable_pipe import TrainablePipe
 
+ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+
+
 spancat_default_config = """
 [model]
 @architectures = "spacy.SpanCategorizer.v1"
@@ -170,6 +175,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
         "model": DEFAULT_SPANCAT_MODEL,
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -182,6 +188,7 @@ def make_spancat(
     scorer: Optional[Callable],
     threshold: float,
     max_positive: Optional[int],
+    save_activations: bool,
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component and configure it for multi-label
     classification to be able to assign multiple labels for each span.
@@ -209,6 +216,7 @@ def make_spancat(
         0.5.
     max_positive (Optional[int]): Maximum number of labels to consider positive
         per span. Defaults to None, indicating no limit.
+        save_activations (bool): save model activations in Doc when annotating.
     """
     return SpanCategorizer(
         nlp.vocab,
@@ -287,6 +295,7 @@ def make_spancat_singlelabel(
         add_negative_label=True,
         threshold=None,
         scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -349,6 +358,7 @@ def __init__(
         max_positive: Optional[int] = None,
         threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize the multi-label or multi-class span categorizer.
 
@@ -398,9 +408,7 @@ def __init__(
         self.model = model
         self.name = name
         self.scorer = scorer
-        self.add_negative_label = add_negative_label
-        if not allow_overlap and max_positive is not None and max_positive > 1:
-            raise ValueError(Errors.E1051.format(max_positive=max_positive))
+        self.save_activations = save_activations
 
     @property
     def key(self) -> str:
@@ -458,28 +466,7 @@ def label_data(self) -> List[str]:
         """
         return list(self.labels)
 
-    @property
-    def _label_map(self) -> Dict[str, int]:
-        """RETURNS (Dict[str, int]): The label map."""
-        return {label: i for i, label in enumerate(self.labels)}
-
-    @property
-    def _n_labels(self) -> int:
-        """RETURNS (int): Number of labels."""
-        if self.add_negative_label:
-            return len(self.labels) + 1
-        else:
-            return len(self.labels)
-
-    @property
-    def _negative_label_i(self) -> Union[int, None]:
-        """RETURNS (Union[int, None]): Index of the negative label."""
-        if self.add_negative_label:
-            return len(self.label_data)
-        else:
-            return None
-
-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -488,11 +475,8 @@ def predict(self, docs: Iterable[Doc]):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        if indices.lengths.sum() == 0:
-            scores = self.model.ops.alloc2f(0, 0)
-        else:
-            scores = self.model.predict((docs, indices))  # type: ignore
-        return indices, scores
+        scores = self.model.predict((docs, indices))  # type: ignore
+        return {"indices": indices, "scores": scores}
 
     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -512,32 +496,32 @@ def set_candidates(
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])
 
-    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        scores: The scores to set, produced by SpanCategorizer.predict.
+        activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
 
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
-        indices, scores = indices_scores
+        labels = self.labels
+
+        indices = activations["indices"]
+        assert isinstance(indices, Ragged)
+        scores = cast(Floats2d, activations["scores"])
+
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
-            allow_overlap = cast(bool, self.cfg["allow_overlap"])
-            if self.cfg["max_positive"] == 1:
-                doc.spans[self.key] = self._make_span_group_singlelabel(
-                    doc,
-                    indices_i,
-                    scores[offset : offset + indices.lengths[i]],
-                    allow_overlap,
-                )
-            else:
-                doc.spans[self.key] = self._make_span_group_multilabel(
-                    doc,
-                    indices_i,
-                    scores[offset : offset + indices.lengths[i]],
-                )
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["indices"] = indices_i
+                doc.activations[self.name]["scores"] = scores[
+                    offset : offset + indices.lengths[i]
+                ]
+            doc.spans[self.key] = self._make_span_group(
+                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
+            )
             offset += indices.lengths[i]
 
     def update(
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 34e85d49c2b..8ecd0c46ee0 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,10 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import numpy
+import srsly
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
+import warnings
 from itertools import islice
 from typing import Callable, Optional
 
@@ -15,6 +21,9 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .trainable_pipe import TrainablePipe
 
+
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
 # See #9050
 BACKWARD_OVERWRITE = False
 
@@ -38,7 +47,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "tagger",
     assigns=["token.tag"],
-    default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_TAGGER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
+        "neg_prefix": "!",
+        "save_activations": False,
+    },
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(
@@ -48,7 +63,7 @@ def make_tagger(
     overwrite: bool,
     scorer: Optional[Callable],
     neg_prefix: str,
-    label_smoothing: float,
+    save_activations: bool,
 ):
     """Construct a part-of-speech tagger component.
 
@@ -57,7 +72,8 @@ def make_tagger(
         in size, and be normalized as probabilities (all scores between 0 and 1,
         with the rows summing to 1).
     """
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
+    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
+                  save_activations=save_activations)
 
 
 def tagger_score(examples, **kwargs):
@@ -83,7 +99,7 @@ class Tagger(TrainablePipe):
         overwrite=BACKWARD_OVERWRITE,
         scorer=tagger_score,
         neg_prefix="!",
-        label_smoothing=0.0,
+        save_activations: bool = False,
     ):
         """Initialize a part-of-speech tagger.
 
@@ -93,6 +109,7 @@ class Tagger(TrainablePipe):
             losses during training.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/tagger#init
         """
@@ -103,6 +120,7 @@ class Tagger(TrainablePipe):
         cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -121,7 +139,7 @@ class Tagger(TrainablePipe):
         """Data about the labels currently added to the component."""
         return tuple(self.cfg["labels"])
 
-    def predict(self, docs):
+    def predict(self, docs) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -134,12 +152,12 @@ class Tagger(TrainablePipe):
             n_labels = len(self.labels)
             guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
             assert len(guesses) == len(docs)
-            return guesses
+            return {"probabilities": guesses, "label_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == len(docs), (len(scores), len(docs))
         guesses = self._scores2guesses(scores)
         assert len(guesses) == len(docs)
-        return guesses
+        return {"probabilities": scores, "label_ids": guesses}
 
     def _scores2guesses(self, scores):
         guesses = []
@@ -150,20 +168,25 @@ class Tagger(TrainablePipe):
             guesses.append(doc_guesses)
         return guesses
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by Tagger.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
 
         DOCS: https://spacy.io/api/tagger#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         labels = self.labels
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index ae227017a9f..6cb33109891 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
 
@@ -14,6 +18,9 @@
 from ..vocab import Vocab
 from .trainable_pipe import TrainablePipe
 
+ActivationsT = Dict[str, Floats2d]
+
+
 single_label_default_config = """
 [model]
 @architectures = "spacy.TextCatEnsemble.v2"
@@ -80,7 +87,8 @@
     default_config={
         "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
+        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -101,6 +109,7 @@ def make_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
+    save_activations: bool,
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -110,8 +119,16 @@ def make_textcat(
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
     scorer (Optional[Callable]): The scoring method.
+    save_activations (bool): save model activations in Doc when annotating.
     """
-    return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
+    return TextCategorizer(
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
+    )
 
 
 def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
@@ -142,6 +159,7 @@ def __init__(
         *,
         threshold: float,
         scorer: Optional[Callable] = textcat_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for single-label classification.
 
@@ -167,6 +185,7 @@ def __init__(
         }
         self.cfg = dict(cfg)
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
@@ -191,7 +210,7 @@ def label_data(self) -> List[str]:
         """
         return self.labels  # type: ignore[return-value]
 
-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -204,12 +223,12 @@ def predict(self, docs: Iterable[Doc]):
             tensors = [doc.tensor for doc in docs]
             xp = self.model.ops.xp
             scores = xp.zeros((len(list(docs)), len(self.labels)))
-            return scores
+            return {"probabilities": scores}
         scores = self.model.predict(docs)
         scores = self.model.ops.asarray(scores)
-        return scores
+        return {"probabilities": scores}
 
-    def set_annotations(self, docs: Iterable[Doc], scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
@@ -217,9 +236,13 @@ def set_annotations(self, docs: Iterable[Doc], scores) -> None:
 
         DOCS: https://spacy.io/api/textcategorizer#set_annotations
         """
+        probs = activations["probabilities"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["probabilities"] = probs[i]
             for j, label in enumerate(self.labels):
-                doc.cats[label] = float(scores[i, j])
+                doc.cats[label] = float(probs[i, j])
 
     def update(
         self,
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 2f8d5e60437..ac024ba3639 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional
 
@@ -78,7 +82,8 @@
     default_config={
         "threshold": 0.5,
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -99,8 +104,9 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "MultiLabel_TextCategorizer":
-    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
+    save_activations: bool,
+) -> "TextCategorizer":
+    """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -111,7 +117,12 @@ def make_multilabel_textcat(
     scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
-        nlp.vocab, model, name, threshold=threshold, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -143,6 +154,7 @@ def __init__(
         *,
         threshold: float,
         scorer: Optional[Callable] = textcat_multilabel_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for multi-label classification.
 
@@ -151,7 +163,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
-        scorer (Optional[Callable]): The scoring method.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init
         """
@@ -162,6 +174,7 @@ def __init__(
         cfg = {"labels": [], "threshold": threshold}
         self.cfg = dict(cfg)
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index b1d2550a1ce..3e9a0a9584d 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe):
     cdef public object model
     cdef public object cfg
     cdef public object scorer
+    cdef bint _save_activations
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 8f219b32797..bd360c9501b 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -2,10 +2,14 @@
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import Model, Optimizer, set_dropout_rate
+from thinc.api import set_dropout_rate, Model, Optimizer
+import warnings
 
 from ..tokens.doc cimport Doc
 
+from ..training import validate_examples
+from ..errors import Errors, Warnings
+from .pipe import Pipe, deserialize_config
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -342,3 +346,11 @@ cdef class TrainablePipe(Pipe):
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self
+
+    @property
+    def save_activations(self):
+        return self._save_activations
+
+    @save_activations.setter
+    def save_activations(self, save_activations: bool):
+        self._save_activations = save_activations
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 5a8f0aee2ab..ba2ed4e5ff3 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pickle
 
 import hypothesis.strategies as st
@@ -8,6 +9,8 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
+from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
@@ -331,3 +334,26 @@ def test_empty_strings():
     no_change = trees.add("xyz", "xyz")
     empty = trees.add("", "")
     assert no_change == empty
+
+
+def test_save_activations():
+    nlp = English()
+    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+    lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = lemmatizer.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "trainable_lemmatizer" not in doc.activations
+
+    lemmatizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+        "probabilities",
+        "tree_ids",
+    ]
+    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 00771a0f0f8..844bacb3b1f 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,7 +1,8 @@
-from typing import Any, Callable, Dict, Iterable, Tuple
+from typing import Callable, Iterable, Dict, Any, cast
 
 import pytest
 from numpy.testing import assert_equal
+from thinc.types import Ragged
 
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
@@ -9,8 +10,7 @@
 from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
 from spacy.lang.en import English
 from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker
-from spacy.pipeline import EntityLinker
+from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
@@ -1194,16 +1194,64 @@ def create_kb(vocab):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
 
 
-def test_span_maker_forward_with_empty():
-    """The forward pass of the span maker may have a doc with no entities."""
+def test_save_activations():
     nlp = English()
-    doc1 = nlp("a b c")
-    ent = doc1[0:1]
-    ent.label_ = "X"
-    doc1.ents = [ent]
-    # no entities
-    doc2 = nlp("x y z")
-
-    # just to get a model
-    span_maker = build_span_maker()
-    span_maker([doc1, doc2], False)
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True))
+    assert isinstance(entity_linker, EntityLinker)
+    entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+
+    # initialize the NEL pipe
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    nO = entity_linker.model.get_dim("nO")
+
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    doc = nlp("Russ Cochran was a publisher")
+    assert "entity_linker" not in doc.activations
+
+    entity_linker.save_activations = True
+    doc = nlp("Russ Cochran was a publisher")
+    assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"}
+    ents = doc.activations["entity_linker"]["ents"]
+    assert isinstance(ents, Ragged)
+    assert ents.data.shape == (2, 1)
+    assert ents.data.dtype == "uint64"
+    assert ents.lengths.shape == (1,)
+    scores = doc.activations["entity_linker"]["scores"]
+    assert isinstance(scores, Ragged)
+    assert scores.data.shape == (2, 1)
+    assert scores.data.dtype == "float32"
+    assert scores.lengths.shape == (1,)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 0d895f23688..c2b65977ac3 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -7,7 +8,8 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.morphology import Morphology
-from spacy.tests.util import make_tempdir
+from spacy.pipeline import TrainablePipe
+from spacy.attrs import MORPH
 from spacy.tokens import Doc
 from spacy.training import Example
 
@@ -224,3 +226,25 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_save_activations():
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "morphologizer" not in doc.activations
+
+    morphologizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {
+        "label_ids",
+        "probabilities",
+    }
+    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 6c76558123f..2e40d86ff48 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal
 
@@ -5,6 +6,7 @@
 from spacy.attrs import SENT_START
 from spacy.lang.en import English
 from spacy.language import Language
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir
 from spacy.training import Example
 
@@ -101,3 +103,26 @@ def test_overfitting_IO():
     # test internal pipe labels vs. Language.pipe_labels with hidden labels
     assert nlp.get_pipe("senter").labels == ("I", "S")
     assert "senter" not in nlp.pipe_labels
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = senter.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "senter" not in doc.activations
+
+    senter.save_activations = True
+    doc = nlp("This is a test.")
+    assert "senter" in doc.activations
+    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+    assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index c143d193fa6..9678e9b63b8 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -594,19 +594,21 @@ def test_set_candidates(name):
     assert docs[0].spans["candidates"][4].text == "Just a"
 
 
-@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
-@pytest.mark.parametrize("n_process", [1, 2])
-def test_spancat_multiprocessing(name, n_process):
-    if isinstance(get_current_ops, NumpyOps) or n_process < 2:
-        nlp = Language()
-        spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
-        train_examples = make_examples(nlp)
-        nlp.initialize(get_examples=lambda: train_examples)
-        texts = [
-            "Just a sentence.",
-            "I like London and Berlin",
-            "I like Berlin",
-            "I eat ham.",
-        ]
-        docs = list(nlp.pipe(texts, n_process=n_process))
-        assert len(docs) == len(texts)
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert "spancat" not in doc.activations
+
+    spancat.save_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 4b5f1ee99fc..5deb323dd71 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -6,7 +7,8 @@
 from spacy.attrs import TAG
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.training import Example
+from spacy.pipeline import TrainablePipe
+from thinc.api import compounding
 
 from ..util import make_tempdir
 
@@ -235,6 +237,26 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    tagger = cast(TrainablePipe, nlp.add_pipe("tagger"))
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "tagger" not in doc.activations
+
+    tagger.save_activations = True
+    doc = nlp("This is a test.")
+    assert "tagger" in doc.activations
+    assert set(doc.activations["tagger"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["tagger"]["probabilities"].shape == (5, len(TAGS))
+    assert doc.activations["tagger"]["label_ids"].shape == (5,)
+
+
 def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 8a0c1a9760d..710dac0571d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,3 +1,4 @@
+from typing import cast
 import random
 
 import numpy.random
@@ -11,17 +12,13 @@
 from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import TextCategorizer
-from spacy.pipeline.textcat import (
-    single_label_bow_config,
-    single_label_cnn_config,
-    single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
-    multi_label_bow_config,
-    multi_label_cnn_config,
-    multi_label_default_config,
-)
+from spacy.pipeline import TextCategorizer, TrainablePipe
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
@@ -298,7 +295,7 @@ def test_issue9904():
     nlp.initialize(get_examples)
 
     examples = get_examples()
-    scores = textcat.predict([eg.predicted for eg in examples])
+    scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
 
     loss = textcat.get_loss(examples, scores)[0]
     loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -949,24 +946,39 @@ def test_textcat_multi_threshold():
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 
 
-@pytest.mark.parametrize(
-    "component_name,scorer",
-    [
-        ("textcat", "spacy.textcat_scorer.v1"),
-        ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
-    ],
-)
-def test_textcat_legacy_scorers(component_name, scorer):
-    """Check that legacy scorers are registered and produce the expected score
-    keys."""
+def test_save_activations():
     nlp = English()
-    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
 
-    # score the model (it's not actually trained but that doesn't matter)
-    scores = nlp.evaluate(train_examples)
-    assert 0 <= scores["cats_score"] <= 1
+    doc = nlp("This is a test.")
+    assert "textcat" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat"].keys()) == ["probabilities"]
+    assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+
+
+def test_save_activations_multi():
+    nlp = English()
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_MULTI_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "textcat_multilabel" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
+    assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index d9719609cdc..5e8975ed337 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
 
     cdef public float sentiment
 
+    cdef public dict activations
+
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 0fae118b4b6..5fda6f2f789 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -16,7 +16,7 @@ from typing import (
 
 import numpy as np
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
 from .span import Span
 from .token import Token
 from .span_groups import SpanGroups
@@ -41,6 +41,7 @@ class Doc:
     max_length: int
     length: int
     sentiment: float
+    activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
     user_token_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 8db8c1d6f37..497656b6570 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -281,6 +281,7 @@ cdef class Doc:
         self.length = 0
         self.sentiment = 0.0
         self.cats = {}
+        self.activations = {}
         self.user_hooks = {}
         self.user_token_hooks = {}
         self.user_span_hooks = {}
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 0a582650076..310ce0dc88d 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {id="attributes"}
 
-| Name                 | Description                                                                                                                         |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `text`               | A string representation of the document text. ~~str~~                                                                               |
-| `text_with_ws`       | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                       |
-| `mem`                | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                            |
-| `vocab`              | The store of lexical types. ~~Vocab~~                                                                                               |
-| `tensor`             | Container for dense vector representations. ~~numpy.ndarray~~                                                                       |
-| `user_data`          | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                    |
-| `lang`               | Language of the document's vocabulary. ~~int~~                                                                                      |
-| `lang_`              | Language of the document's vocabulary. ~~str~~                                                                                      |
-| `sentiment`          | The document's positivity/negativity score, if available. ~~float~~                                                                 |
-| `user_hooks`         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                           |
-| `user_token_hooks`   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                   |
-| `user_span_hooks`    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                    |
-| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
-| `_`                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~       |
+| Name                                       | Description                                                                                                                                    |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`                                     | A string representation of the document text. ~~str~~                                                                                          |
+| `text_with_ws`                             | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                  |
+| `mem`                                      | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                       |
+| `vocab`                                    | The store of lexical types. ~~Vocab~~                                                                                                          |
+| `tensor` <Tag variant="new">2</Tag>        | Container for dense vector representations. ~~numpy.ndarray~~                                                                                  |
+| `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
+| `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
+| `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
+| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
+| `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
+| `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
+| `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
+| `has_unknown_spaces`                       | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~            |
+| `_`                                        | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                  |
+| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
 
 ## Serialization fields {id="serialization-fields"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 82967482c90..17af19e8c38 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -44,14 +44,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
 > ```
 
-| Setting         | Description                                                                                                                                                                                                                                                                                                        |
-| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `backoff`       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                      |
-| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                         |
-| `overwrite`     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                          |
-| `top_k`         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                              |
-| `scorer`        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                      |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                                        |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `backoff`                                       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                      |
+| `min_tree_freq`                                 | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                         |
+| `overwrite`                                     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                          |
+| `top_k`                                         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                              |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                      |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"tree_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                    |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 21d2e9015ce..85b872151fd 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
-| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
-| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
-| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
-| `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
-| `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
-| `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
-| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                                             |
-| `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
-| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` <Tag variant="new">3.5.1</Tag>  | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
-| `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
-| `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
-| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                      |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                 |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              |
+| `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           |
+| `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        |
+| `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      |
+| `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
+| `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
+| `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
+| `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index ce16f534219..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -42,13 +42,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("morphologizer", config=config)
 > ```
 
-| Setting                                        | Description                                                                                                                                                                                                                                                            |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                        | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>       | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
-| `extend` <Tag variant="new">3.2</Tag>          | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
-| `scorer` <Tag variant="new">3.2</Tag>          | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
-| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                               |
+| Setting                                         | Description                                                                                                                                                                                                                                                            |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
@@ -400,8 +400,8 @@ coarse-grained POS as the feature `POS`.
 > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
 > ```
 
-| Name        | Description                                            |
-| ----------- | ------------------------------------------------------ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
 | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 5435399f956..d5d096d7659 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -39,11 +39,12 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("senter", config=config)
 > ```
 
-| Setting                                  | Description                                                                                                                                                           |
-| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                             |
-| `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~                                   |
+| Setting                                         | Description                                                                                                                                                           |
+| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                             |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~                                   |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                      |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/senter.pyx
diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 98a1948eeab..258db794786 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -62,32 +62,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("spancat", config=config)
 > ```
 
-> #### Example (spancat_singlelabel)
->
-> ```python
-> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
-> config = {
->     "spans_key": "labeled_spans",
->     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
->     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
->     # Additional spancat_singlelabel parameters
->     "negative_weight": 0.8,
->     "allow_overlap": True,
-> }
-> nlp.add_pipe("spancat_singlelabel", config=config)
-> ```
-
-| Setting                                             | Description                                                                                                                                                                                                                                                                                             |
-| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `suggester`                                         | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                  |
-| `model`                                             | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
-| `spans_key`                                         | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                  |
-| `threshold`                                         | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~                                                |
-| `max_positive`                                      | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~                                                                                 |
-| `scorer`                                            | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                       |
-| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span` . This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~            |
-| `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               |
-| `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                             |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `suggester`                                     | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                  |
+| `model`                                         | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
+| `spans_key`                                     | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                  |
+| `threshold`                                     | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~                                                                                                                                                          |
+| `max_positive`                                  | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~                                                                                                                                                                                      |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                       |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                 |
 
 <Infobox variant="warning">
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index d9b0506fb17..20852e8eb94 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -40,13 +40,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("tagger", config=config)
 > ```
 
-| Setting                                        | Description                                                                                                                                                                                                                                                                                            |
-| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                        | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag>       | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                              |
-| `scorer` <Tag variant="new">3.2</Tag>          | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                            |
-| `neg_prefix` <Tag variant="new">3.2.1</Tag>    | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                              |
-| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                                                               |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                            |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                         | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                              |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                            |
+| `neg_prefix` <Tag variant="new">3.2.1</Tag>     | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                              |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                       |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/tagger.pyx
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a259b7b3c65..a1dfb6dd88e 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -116,14 +116,15 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name           | Description                                                                                                                      |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                                 |
-| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
-| _keyword-only_ |                                                                                                                                  |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~        |
-| `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| Name                                            | Description                                                                                                                      |
+| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                         | The shared vocabulary. ~~Vocab~~                                                                                                 |
+| `model`                                         | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
+| `name`                                          | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
+| _keyword-only_                                  |                                                                                                                                  |
+| `threshold`                                     | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                   |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. The supported activations is `"probabilities"`. ~~Union[bool, list[str]]~~            |
 
 ## TextCategorizer.\_\_call\_\_ {id="call",tag="method"}
 

From 41f349c937a3e7b26f7ff390c0ec264a231baa90 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 15 Sep 2022 17:06:58 +0200
Subject: [PATCH 016/504] disable mypy run for Python 3.10 (#11508) (#11512)

---
 .github/azure-steps.yml | 117 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 .github/azure-steps.yml

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
new file mode 100644
index 00000000000..c7722391fec
--- /dev/null
+++ b/.github/azure-steps.yml
@@ -0,0 +1,117 @@
+parameters:
+  python_version: ''
+  architecture: ''
+  prefix: ''
+  gpu: false
+  num_build_jobs: 1
+
+steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: ${{ parameters.python_version }}
+      architecture: ${{ parameters.architecture }}
+
+  - bash: |
+      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
+    displayName: 'Set variables'
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U pip setuptools
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install dependencies"
+
+  - script: |
+      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
+      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
+    displayName: "Compile and build sdist"
+
+  - script: python -m mypy spacy
+    displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')
+
+  - task: DeleteFiles@1
+    inputs:
+      contents: "spacy"
+    displayName: "Delete source directory"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
+      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+    displayName: "Uninstall all packages"
+
+  - bash: |
+      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+    displayName: "Install from sdist"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
+      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
+    displayName: "Install GPU requirements"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+    condition: eq(${{ parameters.gpu }}, false)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
+    displayName: "Run GPU tests"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l ca ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Test website/meta/universe.json'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

From 9dfb4e1217573d36f80b6a5041008dd03389915a Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Oct 2022 14:41:15 +0200
Subject: [PATCH 017/504] fix test for EL activations with refactored KB

---
 spacy/tests/pipeline/test_entity_linker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 844bacb3b1f..80b6e766347 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1209,7 +1209,7 @@ def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(

From c83a934cd6b3d6a878895287d69376bd4e7bd2d8 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:51:06 +0200
Subject: [PATCH 018/504] `StringStore` refactoring (#11344)

* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb6b08cfdcfbf26df6896f26709fac0d.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
---
 spacy/errors.py                               |   4 +-
 spacy/matcher/matcher.pyx                     |   3 +
 spacy/strings.pxd                             |  22 +-
 spacy/strings.pyi                             |  22 +-
 spacy/strings.pyx                             | 410 +++++++++---------
 spacy/tests/vocab_vectors/test_stringstore.py |  41 +-
 spacy/tokens/graph.pyx                        |   4 +-
 spacy/tokens/retokenizer.pyx                  |   4 +-
 website/docs/api/stringstore.mdx              |  82 +++-
 9 files changed, 334 insertions(+), 258 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 146c60b6d60..9814679eb7d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -258,7 +258,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
-    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
+    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
     E019 = ("Can't create transition with unknown action ID: {action}. Action "
@@ -991,6 +991,8 @@ class Errors(metaclass=ErrorsWithCodes):
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
+    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
+             "but got '{received_type}'")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 715dd45f07c..7e734ac247e 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -20,6 +20,9 @@ from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
 
+from ..schemas import validate_token_pattern
+from ..errors import Errors, MatchPatternError, Warnings
+from ..strings cimport get_string_id
 from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index d22f48ba133..b734a707c54 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,3 +1,6 @@
+from libc.stdint cimport int64_t, uint32_t
+from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
 from libcpp.set cimport set
@@ -7,13 +10,6 @@ from preshed.maps cimport PreshMap
 
 from .typedefs cimport attr_t, hash_t
 
-
-cpdef hash_t hash_string(str string) except 0
-cdef hash_t hash_utf8(char* utf8_string, int length) nogil
-
-cdef str decode_Utf8Str(const Utf8Str* string)
-
-
 ctypedef union Utf8Str:
     unsigned char[8] s
     unsigned char* p
@@ -21,9 +17,13 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
+    cdef vector[hash_t] _keys
+    cdef PreshMap _map
+
+    cdef hash_t _intern_str(self, str string)
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *
+    cdef str _decode_str_repr(self, const Utf8Str* string)
 
-    cdef vector[hash_t] keys
-    cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+cpdef hash_t hash_string(object string) except -1
+cpdef hash_t get_string_id(object string_or_hash) except -1
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index f8fe8381c87..8b7c0d6bd5a 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,21 +1,21 @@
+from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
-def get_string_id(key: Union[str, int]) -> int: ...
-
 class StringStore:
-    def __init__(
-        self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
-    ) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
     @overload
-    def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ...
+    def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
-    def __getitem__(self, string_or_id: int) -> str: ...
-    def as_int(self, key: Union[bytes, str, int]) -> int: ...
-    def as_string(self, key: Union[bytes, str, int]) -> str: ...
+    def __getitem__(self, string_or_hash: int) -> str: ...
+    def as_int(self, string_or_hash: Union[str, int]) -> int: ...
+    def as_string(self, string_or_hash: Union[str, int]) -> str: ...
     def add(self, string: str) -> int: ...
+    def items(self) -> List[Tuple[str, int]]: ...
+    def keys(self) -> List[str]: ...
+    def values(self) -> List[int]: ...
     def __len__(self) -> int: ...
-    def __contains__(self, string: str) -> bool: ...
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
     def __iter__(self) -> Iterator[str]: ...
     def __reduce__(self) -> Any: ...
     def to_disk(self, path: Union[str, Path]) -> None: ...
@@ -23,3 +23,5 @@ class StringStore:
     def to_bytes(self, **kwargs: Any) -> bytes: ...
     def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
     def _reset_and_load(self, strings: Iterable[str]) -> None: ...
+
+def get_string_id(string_or_hash: Union[str, int]) -> int: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index e73b66dff54..73e4c46ed46 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,9 +1,8 @@
 # cython: infer_types=True
-# cython: profile=False
+from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
 cimport cython
 from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-from murmurhash.mrmr cimport hash32, hash64
+from murmurhash.mrmr cimport hash64
 
 import srsly
 
@@ -15,105 +14,13 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
-cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
-    try:
-        out_hash[0] = key
-        return True
-    except:  # no-cython-lint
-        return False
-
-
-def get_string_id(key):
-    """Get a string ID, handling the reserved symbols correctly. If the key is
-    already an ID, return it.
-
-    This function optimises for convenience over performance, so shouldn't be
-    used in tight loops.
-    """
-    cdef hash_t str_hash    
-    if isinstance(key, str):
-        if len(key) == 0:
-            return 0
-
-        symbol = SYMBOLS_BY_STR.get(key, None)
-        if symbol is not None:
-            return symbol
-        else:
-            chars = key.encode("utf8")
-            return hash_utf8(chars, len(chars))
-    elif _try_coerce_to_hash(key, &str_hash):
-        # Coerce the integral key to the expected primitive hash type.
-        # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used 
-        # downsteam (as these are internally implemented as custom PyObjects 
-        # whose comparison operators can incur a significant overhead).
-        return str_hash
-    else:
-        # TODO: Raise an error instead
-        return key
-
-
-cpdef hash_t hash_string(str string) except 0:
-    chars = string.encode("utf8")
-    return hash_utf8(chars, len(chars))
-
-
-cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
-    return hash64(utf8_string, length, 1)
-
-
-cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
-    return hash32(utf8_string, length, 1)
-
-
-cdef str decode_Utf8Str(const Utf8Str* string):
-    cdef int i, length
-    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
-        return string.s[1:string.s[0]+1].decode("utf8")
-    elif string.p[0] < 255:
-        return string.p[1:string.p[0]+1].decode("utf8")
-    else:
-        i = 0
-        length = 0
-        while string.p[i] == 255:
-            i += 1
-            length += 255
-        length += string.p[i]
-        i += 1
-        return string.p[i:length + i].decode("utf8")
-
-
-cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
-    cdef int n_length_bytes
-    cdef int i
-    cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
-    if length < sizeof(string.s):
-        string.s[0] = <unsigned char>length
-        memcpy(&string.s[1], chars, length)
-        return string
-    elif length < 255:
-        string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
-        string.p[0] = length
-        memcpy(&string.p[1], chars, length)
-        return string
-    else:
-        i = 0
-        n_length_bytes = (length // 255) + 1
-        string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
-        for i in range(n_length_bytes-1):
-            string.p[i] = 255
-        string.p[n_length_bytes-1] = length % 255
-        memcpy(&string.p[n_length_bytes], chars, length)
-        return string
-
 
 cdef class StringStore:
-    """Look up strings by 64-bit hashes.
+    """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
     DOCS: https://spacy.io/api/stringstore
     """
-    def __init__(self, strings=None, freeze=False):
+    def __init__(self, strings: Optional[Iterable[str]] = None):
         """Create the StringStore.
 
         strings (iterable): A sequence of unicode strings to add to the store.
@@ -124,127 +31,126 @@ cdef class StringStore:
             for string in strings:
                 self.add(string)
 
-    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given hash, or vice versa.
+    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
+        """Retrieve a string from a given hash. If a string
+        is passed as the input, add it to the store and return
+        its hash.
 
-        string_or_id (bytes, str or uint64): The value to encode.
-        Returns (str / uint64): The value to be retrieved.
+        string_or_hash (int / str): The hash value to lookup or the string to store.
+        RETURNS (str / int): The stored string or the hash of the newly added string.
         """
-        cdef hash_t str_hash
-        cdef Utf8Str* utf8str = NULL
-
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return 0
-
-            # Return early if the string is found in the symbols LUT.
-            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
-            if symbol is not None:
-                return symbol
-            else:
-                return hash_string(string_or_id)
-        elif isinstance(string_or_id, bytes):
-            return hash_utf8(string_or_id, len(string_or_id))
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            if str_hash == 0:
-                return ""
-            elif str_hash in SYMBOLS_BY_INT:
-                return SYMBOLS_BY_INT[str_hash]
-            else:
-                utf8str = <Utf8Str*>self._map.get(str_hash)
+        if isinstance(string_or_hash, str):
+            return self.add(string_or_hash)
         else:
-            # TODO: Raise an error instead
-            utf8str = <Utf8Str*>self._map.get(string_or_id)
+            return self._get_interned_str(string_or_hash)
 
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool:
+        """Check whether a string or a hash is in the store.
 
-    def as_int(self, key):
-        """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, str):
-            return key
+        string (str / int): The string/hash to check.
+        RETURNS (bool): Whether the store contains the string.
+        """
+        cdef hash_t str_hash = get_string_id(string_or_hash)
+        if str_hash in SYMBOLS_BY_INT:
+            return True
         else:
-            return self[key]
+            return self._map.get(str_hash) is not NULL
 
-    def as_string(self, key):
-        """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, str):
-            return key
-        else:
-            return self[key]
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the strings in the store in insertion order.
+
+        RETURNS: An iterable collection of strings.
+        """
+        return iter(self.keys())
+
+    def __reduce__(self):
+        strings = list(self)
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self._keys.size()
 
-    def add(self, string):
+    def add(self, string: str) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
         RETURNS (uint64): The string's hash value.
         """
-        cdef hash_t str_hash
-        if isinstance(string, str):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-
-            string = string.encode("utf8")
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        elif isinstance(string, bytes):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        else:
+        if not isinstance(string, str):
             raise TypeError(Errors.E017.format(value_type=type(string)))
-        return str_hash
 
-    def __len__(self):
-        """The number of strings in the store.
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string)
 
-        RETURNS (int): The number of strings in the store.
+    def as_int(self, string_or_hash: Union[str, int]) -> str:
+        """If a hash value is passed as the input, return it as-is. If the input
+        is a string, return its corresponding hash.
+
+        string_or_hash (str / int): The string to hash or a hash value.
+        RETURNS (int): The hash of the string or the input hash value.
         """
-        return self.keys.size()
+        if isinstance(string_or_hash, int):
+            return string_or_hash
+        else:
+            return get_string_id(string_or_hash)
 
-    def __contains__(self, string_or_id not None):
-        """Check whether a string or ID is in the store.
+    def as_string(self, string_or_hash: Union[str, int]) -> str:
+        """If a string is passed as the input, return it as-is. If the input
+        is a hash value, return its corresponding string.
 
-        string_or_id (str or int): The string to check.
-        RETURNS (bool): Whether the store contains the string.
+        string_or_hash (str / int): The hash value to lookup or a string.
+        RETURNS (str): The stored string or the input string.
         """
-        cdef hash_t str_hash
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return True
-            elif string_or_id in SYMBOLS_BY_STR:
-                return True
-            str_hash = hash_string(string_or_id)
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            pass
+        if isinstance(string_or_hash, str):
+            return string_or_hash
         else:
-            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            return self._get_interned_str(string_or_hash)
 
-        if str_hash in SYMBOLS_BY_INT:
-            return True
-        else:
-            return self._map.get(str_hash) is not NULL
+    def items(self) -> List[Tuple[str, int]]:
+        """Iterate over the stored strings and their hashes in insertion order.
 
-    def __iter__(self):
-        """Iterate over the strings in the store, in order.
+        RETURNS: A list of string-hash pairs.
+        """
+        # Even though we internally store the hashes as keys and the strings as
+        # values, we invert the order in the public API to keep it consistent with
+        # the implementation of the `__iter__` method (where we wish to iterate over
+        # the strings in the store).
+        cdef int i
+        pairs = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            str_hash = self._keys[i]
+            utf8str = <Utf8Str*>self._map.get(str_hash)
+            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
+        return pairs
+
+    def keys(self) -> List[str]:
+        """Iterate over the stored strings in insertion order.
 
-        YIELDS (str): A string in the store.
+        RETURNS: A list of strings.
         """
         cdef int i
-        cdef hash_t key
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            utf8str = <Utf8Str*>self._map.get(key)
-            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        strings = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            utf8str = <Utf8Str*>self._map.get(self._keys[i])
+            strings[i] = self._decode_str_repr(utf8str)
+        return strings
 
-    def __reduce__(self):
-        strings = list(self)
-        return (StringStore, (strings,), None, None, None)
+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashs.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        return hashes
 
     def to_disk(self, path):
         """Save the current state to a directory.
@@ -295,24 +201,122 @@ cdef class StringStore:
     def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self.keys.clear()
+        self._keys.clear()
         for string in strings:
             self.add(string)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+    def _get_interned_str(self, hash_value: int) -> str:
+        cdef hash_t str_hash
+        if not _try_coerce_to_hash(hash_value, &str_hash):
+            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))
+
+        # Handle reserved symbols and empty strings correctly.
+        if str_hash == 0:
+            return ""
 
-    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+        symbol = SYMBOLS_BY_INT.get(str_hash)
+        if symbol is not None:
+            return symbol
+
+        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL:
+            raise KeyError(Errors.E018.format(hash_value=str_hash))
+        else:
+            return self._decode_str_repr(utf8str)
+
+    cdef hash_t _intern_str(self, str string):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
+        chars = string.encode('utf-8')
+        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
-            return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+            return key
+
+        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
         self._map.set(key, value)
-        self.keys.push_back(key)
-        return value
+        self._keys.push_back(key)
+        return key
+
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
+        cdef int n_length_bytes
+        cdef int i
+        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
+        cdef uint32_t ulength = length
+        if length < sizeof(string.s):
+            string.s[0] = <unsigned char>length
+            memcpy(&string.s[1], chars, length)
+            return string
+        elif length < 255:
+            string.p = <unsigned char*>self.mem.alloc(length + 1, sizeof(unsigned char))
+            string.p[0] = length
+            memcpy(&string.p[1], chars, length)
+            return string
+        else:
+            i = 0
+            n_length_bytes = (length // 255) + 1
+            string.p = <unsigned char*>self.mem.alloc(length + n_length_bytes, sizeof(unsigned char))
+            for i in range(n_length_bytes-1):
+                string.p[i] = 255
+            string.p[n_length_bytes-1] = length % 255
+            memcpy(&string.p[n_length_bytes], chars, length)
+            return string
+
+    cdef str _decode_str_repr(self, const Utf8Str* string):
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1].decode('utf-8')
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1].decode('utf-8')
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i].decode('utf-8')
+
+
+cpdef hash_t hash_string(object string) except -1:
+    if not isinstance(string, str):
+        raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string)))
+
+    # Handle reserved symbols and empty strings correctly.
+    if len(string) == 0:
+        return 0
+
+    symbol = SYMBOLS_BY_STR.get(string)
+    if symbol is not None:
+        return symbol
+
+    chars = string.encode('utf-8')
+    return hash64(<unsigned char*>chars, len(chars), 1)
+
+
+cpdef hash_t get_string_id(object string_or_hash) except -1:
+    cdef hash_t str_hash
+
+    try:
+        return hash_string(string_or_hash)
+    except:
+        if _try_coerce_to_hash(string_or_hash, &str_hash):
+            # Coerce the integral key to the expected primitive hash type.
+            # This ensures that custom/overloaded "primitive" data types
+            # such as those implemented by numpy are not inadvertently used
+            # downsteam (as these are internally implemented as custom PyObjects
+            # whose comparison operators can incur a significant overhead).
+            return str_hash
+        else:
+            raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash)))
+
+
+# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
+cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
+    try:
+        out_hash[0] = key
+        return True
+    except:
+        return False
+
diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py
index 61039fffd4c..68c307939d3 100644
--- a/spacy/tests/vocab_vectors/test_stringstore.py
+++ b/spacy/tests/vocab_vectors/test_stringstore.py
@@ -25,6 +25,14 @@ def test_stringstore_from_api_docs(stringstore):
     stringstore.add("orange")
     all_strings = [s for s in stringstore]
     assert all_strings == ["apple", "orange"]
+    assert all_strings == list(stringstore.keys())
+    all_strings_and_hashes = list(stringstore.items())
+    assert all_strings_and_hashes == [
+        ("apple", 8566208034543834098),
+        ("orange", 2208928596161743350),
+    ]
+    all_hashes = list(stringstore.values())
+    assert all_hashes == [8566208034543834098, 2208928596161743350]
     banana_hash = stringstore.add("banana")
     assert len(stringstore) == 3
     assert banana_hash == 2525716904149915114
@@ -32,12 +40,25 @@ def test_stringstore_from_api_docs(stringstore):
     assert stringstore["banana"] == banana_hash
 
 
-@pytest.mark.parametrize("text1,text2,text3", [(b"Hello", b"goodbye", b"hello")])
-def test_stringstore_save_bytes(stringstore, text1, text2, text3):
-    key = stringstore.add(text1)
-    assert stringstore[text1] == key
-    assert stringstore[text2] != key
-    assert stringstore[text3] != key
+@pytest.mark.parametrize(
+    "val_bytes,val_float,val_list,val_text,val_hash",
+    [(b"Hello", 1.1, ["abc"], "apple", 8566208034543834098)],
+)
+def test_stringstore_type_checking(
+    stringstore, val_bytes, val_float, val_list, val_text, val_hash
+):
+    with pytest.raises(TypeError):
+        assert stringstore[val_bytes]
+
+    with pytest.raises(TypeError):
+        stringstore.add(val_float)
+
+    with pytest.raises(TypeError):
+        assert val_list not in stringstore
+
+    key = stringstore.add(val_text)
+    assert val_hash == key
+    assert stringstore[val_hash] == val_text
 
 
 @pytest.mark.parametrize("text1,text2,text3", [("Hello", "goodbye", "hello")])
@@ -48,19 +69,19 @@ def test_stringstore_save_unicode(stringstore, text1, text2, text3):
     assert stringstore[text3] != key
 
 
-@pytest.mark.parametrize("text", [b"A"])
+@pytest.mark.parametrize("text", ["A"])
 def test_stringstore_retrieve_id(stringstore, text):
     key = stringstore.add(text)
     assert len(stringstore) == 1
-    assert stringstore[key] == text.decode("utf8")
+    assert stringstore[key] == text
     with pytest.raises(KeyError):
         stringstore[20000]
 
 
-@pytest.mark.parametrize("text1,text2", [(b"0123456789", b"A")])
+@pytest.mark.parametrize("text1,text2", [("0123456789", "A")])
 def test_stringstore_med_string(stringstore, text1, text2):
     store = stringstore.add(text1)
-    assert stringstore[store] == text1.decode("utf8")
+    assert stringstore[store] == text1
     stringstore.add(text2)
     assert stringstore[text1] == store
 
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 6c4ce6ce358..22ce18181a7 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -16,9 +16,7 @@ from murmurhash.mrmr cimport hash64
 from .. import Errors
 
 from ..typedefs cimport hash_t
-
-from ..strings import get_string_id
-
+from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
 
 from .token import Token
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index b0e4ff85c9f..d3e9c5674cc 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -15,9 +15,7 @@ from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
-from ..strings import get_string_id
-from ..util import SimpleFrozenDict
-from .underscore import is_writable_attr
+from ..strings cimport get_string_id
 
 
 cdef class Retokenizer:
diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 6a3e9d6644e..d4d85e6d56a 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -47,7 +47,8 @@ Get the number of strings in the store.
 
 ## StringStore.\_\_getitem\_\_ {id="getitem",tag="method"}
 
-Retrieve a string from a given hash, or vice versa.
+Retrieve a string from a given hash. If a string is passed as the input, add it
+to the store and return its hash.
 
 > #### Example
 >
@@ -58,14 +59,14 @@ Retrieve a string from a given hash, or vice versa.
 > assert stringstore[apple_hash] == "apple"
 > ```
 
-| Name           | Description                                     |
-| -------------- | ----------------------------------------------- |
-| `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ |
-| **RETURNS**    | The value to be retrieved. ~~Union[str, int]~~  |
+| Name             | Description                                                                  |
+| ---------------- | ---------------------------------------------------------------------------- |
+| `string_or_hash` | The hash value to lookup or the string to store. ~~Union[str, int]~~         |
+| **RETURNS**      | The stored string or the hash of the newly added string. ~~Union[str, int]~~ |
 
 ## StringStore.\_\_contains\_\_ {id="contains",tag="method"}
 
-Check whether a string is in the store.
+Check whether a string or a hash is in the store.
 
 > #### Example
 >
@@ -75,15 +76,14 @@ Check whether a string is in the store.
 > assert not "cherry" in stringstore
 > ```
 
-| Name        | Description                                     |
-| ----------- | ----------------------------------------------- |
-| `string`    | The string to check. ~~str~~                    |
-| **RETURNS** | Whether the store contains the string. ~~bool~~ |
+| Name             | Description                                             |
+| ---------------- | ------------------------------------------------------- |
+| `string_or_hash` | The string or hash to check. ~~Union[str, int]~~        |
+| **RETURNS**      | Whether the store contains the string or hash. ~~bool~~ |
 
 ## StringStore.\_\_iter\_\_ {id="iter",tag="method"}
 
-Iterate over the strings in the store, in order. Note that a newly initialized
-store will always include an empty string `""` at position `0`.
+Iterate over the stored strings in insertion order.
 
 > #### Example
 >
@@ -93,11 +93,59 @@ store will always include an empty string `""` at position `0`.
 > assert all_strings == ["apple", "orange"]
 > ```
 
-| Name       | Description                    |
-| ---------- | ------------------------------ |
-| **YIELDS** | A string in the store. ~~str~~ |
+| Name        | Description                    |
+| ----------- | ------------------------------ |
+| **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.add {id="add",tag="method",version="2"}
+## StringStore.items {#iter tag="method" new="4"}
+
+Iterate over the stored string-hash pairs in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_strings_and_hashes = stringstore.items()
+> assert all_strings_and_hashes == [("apple", 8566208034543834098), ("orange", 2208928596161743350)]
+> ```
+
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
+| **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
+
+## StringStore.keys {#iter tag="method" new="4"}
+
+Iterate over the stored strings in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_strings = stringstore.keys()
+> assert all_strings == ["apple", "orange"]
+> ```
+
+| Name        | Description                      |
+| ----------- | -------------------------------- |
+| **RETURNS** | A list of strings. ~~List[str]~~ |
+
+## StringStore.values {#iter tag="method" new="4"}
+
+Iterate over the stored string hashes in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_hashes = stringstore.values()
+> assert all_hashes == [8566208034543834098, 2208928596161743350]
+> ```
+
+| Name        | Description                            |
+| ----------- | -------------------------------------- |
+| **RETURNS** | A list of string hashes. ~~List[int]~~ |
+
+## StringStore.add {#add tag="method"}
 
 Add a string to the `StringStore`.
 
@@ -117,7 +165,7 @@ Add a string to the `StringStore`.
 | `string`    | The string to add. ~~str~~       |
 | **RETURNS** | The string's hash value. ~~int~~ |
 
-## StringStore.to_disk {id="to_disk",tag="method",version="2"}
+## StringStore.to_disk {#to_disk tag="method"}
 
 Save the current state to a directory.
 

From a3f3b7ac7597e0d21310137d60c237a3dfefda57 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 21 Oct 2022 18:01:18 +0900
Subject: [PATCH 019/504] Remove thinc util reimports (#11665)

* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
---
 spacy/tests/pipeline/test_spancat.py | 7 +++----
 spacy/util.py                        | 8 +++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 9678e9b63b8..5dcc2e70f67 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,6 @@
 import numpy
-import pytest
-from numpy.testing import assert_almost_equal, assert_array_equal
-from thinc.api import NumpyOps, Ragged, get_current_ops
+from numpy.testing import assert_array_equal, assert_almost_equal
+from thinc.api import get_current_ops, Ragged, fix_random_seed
 
 from spacy import util
 from spacy.lang.en import English
@@ -9,7 +8,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import fix_random_seed, make_tempdir, registry
+from spacy.util import registry, make_tempdir
 
 OPS = get_current_ops()
 
diff --git a/spacy/util.py b/spacy/util.py
index c127be03c37..8068c4bcec9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -63,12 +63,10 @@
 except ImportError:
     cupy = None
 
-# These are functions that were previously (v2.x) available from spacy.util
-# and have since moved to Thinc. We're importing them here so people's code
-# doesn't break, but they should always be imported from Thinc from now on,
-# not from spacy.util.
-from thinc.api import compounding, decaying, fix_random_seed  # noqa: F401
 
+from .symbols import ORTH
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
 from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings

From 36f477a31aa0bc06dfcfa8817461234f6785e753 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 020/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/errors.py                               |   8 +-
 spacy/pipeline/__init__.py                    |   2 -
 spacy/pipeline/entity_ruler.py                | 541 ------------------
 spacy/pipeline/span_ruler.py                  |  23 +-
 spacy/tests/matcher/test_phrase_matcher.py    |   9 +-
 spacy/tests/pipeline/test_entity_ruler.py     | 259 +++------
 .../serialize/test_serialize_pipeline.py      |  67 +--
 website/docs/api/entityruler.mdx              | 311 ++--------
 website/docs/api/spanruler.mdx                |  13 +-
 website/docs/usage/101/_architecture.mdx      |  40 +-
 website/docs/usage/101/_pipelines.mdx         |   6 +-
 website/docs/usage/processing-pipelines.mdx   |   5 +-
 website/docs/usage/rule-based-matching.mdx    |  43 +-
 website/docs/usage/saving-loading.mdx         |  10 +-
 website/docs/usage/training.mdx               |   2 +-
 15 files changed, 245 insertions(+), 1094 deletions(-)
 delete mode 100644 spacy/pipeline/entity_ruler.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 9814679eb7d..965c92066bc 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -470,13 +470,13 @@ class Errors(metaclass=ErrorsWithCodes):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E153 = ("The value type {vtype} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E154 = ("One of the attributes or values is not supported for token "
             "patterns. Please use the option `validate=True` with the Matcher, "
-            "PhraseMatcher, or EntityRuler for more details.")
+            "PhraseMatcher, or SpanRuler for more details.")
     E155 = ("The pipeline needs to include a {pipe} in order to use "
             "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
@@ -933,8 +933,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
-             "exist.")
     E1024 = ("A pattern with {attr_type} '{label}' is not present in "
              "'{component}' patterns.")
     E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 82d24486a27..e26f7436efa 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,7 +3,6 @@
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
-from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
@@ -25,7 +24,6 @@
     "EditTreeLemmatizer",
     "EntityLinker",
     "EntityRecognizer",
-    "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
     "MultiLabel_TextCategorizer",
diff --git a/spacy/pipeline/entity_ruler.py b/spacy/pipeline/entity_ruler.py
deleted file mode 100644
index 3683cfc0270..00000000000
--- a/spacy/pipeline/entity_ruler.py
+++ /dev/null
@@ -1,541 +0,0 @@
-import warnings
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
-
-import srsly
-
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..matcher import Matcher, PhraseMatcher
-from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import get_ner_prf
-from ..tokens import Doc, Span
-from ..training import Example
-from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
-from .pipe import Pipe
-
-DEFAULT_ENT_ID_SEP = "||"
-PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
-
-
-@Language.factory(
-    "entity_ruler",
-    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
-    default_config={
-        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-        "validate": False,
-        "overwrite_ents": False,
-        "ent_id_sep": DEFAULT_ENT_ID_SEP,
-        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
-    },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
-)
-def make_entity_ruler(
-    nlp: Language,
-    name: str,
-    phrase_matcher_attr: Optional[Union[int, str]],
-    matcher_fuzzy_compare: Callable,
-    validate: bool,
-    overwrite_ents: bool,
-    ent_id_sep: str,
-    scorer: Optional[Callable],
-):
-    return EntityRuler(
-        nlp,
-        name,
-        phrase_matcher_attr=phrase_matcher_attr,
-        matcher_fuzzy_compare=matcher_fuzzy_compare,
-        validate=validate,
-        overwrite_ents=overwrite_ents,
-        ent_id_sep=ent_id_sep,
-        scorer=scorer,
-    )
-
-
-def entity_ruler_score(examples, **kwargs):
-    return get_ner_prf(examples)
-
-
-@registry.scorers("spacy.entity_ruler_scorer.v1")
-def make_entity_ruler_scorer():
-    return entity_ruler_score
-
-
-class EntityRuler(Pipe):
-    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
-    rules or exact phrase matches. It can be combined with the statistical
-    `EntityRecognizer` to boost accuracy, or used on its own to implement a
-    purely rule-based entity recognition system. After initialization, the
-    component is typically added to the pipeline using `nlp.add_pipe`.
-
-    DOCS: https://spacy.io/api/entityruler
-    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
-    """
-
-    def __init__(
-        self,
-        nlp: Language,
-        name: str = "entity_ruler",
-        *,
-        phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = levenshtein_compare,
-        validate: bool = False,
-        overwrite_ents: bool = False,
-        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
-        patterns: Optional[List[PatternType]] = None,
-        scorer: Optional[Callable] = entity_ruler_score,
-    ) -> None:
-        """Initialize the entity ruler. If patterns are supplied here, they
-        need to be a list of dictionaries with a `"label"` and `"pattern"`
-        key. A pattern can either be a token pattern (list) or a phrase pattern
-        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
-
-        nlp (Language): The shared nlp object to pass the vocab to the matchers
-            and process phrase patterns.
-        name (str): Instance name of the current pipeline component. Typically
-            passed in automatically from the factory when the component is
-            added. Used to disable the current entity ruler while creating
-            phrase patterns with the nlp object.
-        phrase_matcher_attr (int / str): Token attribute to match on, passed
-            to the internal PhraseMatcher as `attr`.
-        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
-            internal Matcher. Defaults to
-            spacy.matcher.levenshtein.levenshtein_compare.
-        validate (bool): Whether patterns should be validated, passed to
-            Matcher and PhraseMatcher as `validate`
-        patterns (iterable): Optional patterns to load in.
-        overwrite_ents (bool): If existing entities are present, e.g. entities
-            added by the model, overwrite them by matches if necessary.
-        ent_id_sep (str): Separator used internally for entity IDs.
-        scorer (Optional[Callable]): The scoring method. Defaults to
-            spacy.scorer.get_ner_prf.
-
-        DOCS: https://spacy.io/api/entityruler#init
-        """
-        self.nlp = nlp
-        self.name = name
-        self.overwrite = overwrite_ents
-        self.token_patterns = defaultdict(list)  # type: ignore
-        self.phrase_patterns = defaultdict(list)  # type: ignore
-        self._validate = validate
-        self.matcher_fuzzy_compare = matcher_fuzzy_compare
-        self.matcher = Matcher(
-            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
-        )
-        self.phrase_matcher_attr = phrase_matcher_attr
-        self.phrase_matcher = PhraseMatcher(
-            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-        )
-        self.ent_id_sep = ent_id_sep
-        self._ent_ids = defaultdict(tuple)  # type: ignore
-        if patterns is not None:
-            self.add_patterns(patterns)
-        self.scorer = scorer
-
-    def __len__(self) -> int:
-        """The number of all patterns added to the entity ruler."""
-        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
-        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
-        return n_token_patterns + n_phrase_patterns
-
-    def __contains__(self, label: str) -> bool:
-        """Whether a label is present in the patterns."""
-        return label in self.token_patterns or label in self.phrase_patterns
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Find matches in document and add them as entities.
-
-        doc (Doc): The Doc object in the pipeline.
-        RETURNS (Doc): The Doc with added entities, if available.
-
-        DOCS: https://spacy.io/api/entityruler#call
-        """
-        error_handler = self.get_error_handler()
-        try:
-            matches = self.match(doc)
-            self.set_annotations(doc, matches)
-            return doc
-        except Exception as e:
-            return error_handler(self.name, self, [doc], e)
-
-    def match(self, doc: Doc):
-        self._require_patterns()
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", message="\\[W036")
-            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
-
-        final_matches = set(
-            [(m_id, start, end) for m_id, start, end in matches if start != end]
-        )
-        get_sort_key = lambda m: (m[2] - m[1], -m[1])
-        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
-        return final_matches
-
-    def set_annotations(self, doc, matches):
-        """Modify the document in place"""
-        entities = list(doc.ents)
-        new_entities = []
-        seen_tokens = set()
-        for match_id, start, end in matches:
-            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
-                continue
-            # check for end - 1 here because boundaries are inclusive
-            if start not in seen_tokens and end - 1 not in seen_tokens:
-                if match_id in self._ent_ids:
-                    label, ent_id = self._ent_ids[match_id]
-                    span = Span(doc, start, end, label=label, span_id=ent_id)
-                else:
-                    span = Span(doc, start, end, label=match_id)
-                new_entities.append(span)
-                entities = [
-                    e for e in entities if not (e.start < end and e.end > start)
-                ]
-                seen_tokens.update(range(start, end))
-        doc.ents = entities + new_entities
-
-    @property
-    def labels(self) -> Tuple[str, ...]:
-        """All labels present in the match patterns.
-
-        RETURNS (set): The string labels.
-
-        DOCS: https://spacy.io/api/entityruler#labels
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_labels = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                label, _ = self._split_label(l)
-                all_labels.add(label)
-            else:
-                all_labels.add(l)
-        return tuple(sorted(all_labels))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        patterns: Optional[Sequence[PatternType]] = None,
-    ):
-        """Initialize the pipe for training.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        patterns Optional[Iterable[PatternType]]: The list of patterns.
-
-        DOCS: https://spacy.io/api/entityruler#initialize
-        """
-        self.clear()
-        if patterns:
-            self.add_patterns(patterns)  # type: ignore[arg-type]
-
-    @property
-    def ent_ids(self) -> Tuple[Optional[str], ...]:
-        """All entity ids present in the match patterns `id` properties
-
-        RETURNS (set): The string entity ids.
-
-        DOCS: https://spacy.io/api/entityruler#ent_ids
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_ent_ids = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                _, ent_id = self._split_label(l)
-                all_ent_ids.add(ent_id)
-        return tuple(all_ent_ids)
-
-    @property
-    def patterns(self) -> List[PatternType]:
-        """Get all patterns that were added to the entity ruler.
-
-        RETURNS (list): The original patterns, one dictionary per pattern.
-
-        DOCS: https://spacy.io/api/entityruler#patterns
-        """
-        all_patterns = []
-        for label, patterns in self.token_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        for label, patterns in self.phrase_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern.text}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        return all_patterns
-
-    def add_patterns(self, patterns: List[PatternType]) -> None:
-        """Add patterns to the entity ruler. A pattern can either be a token
-        pattern (list of dicts) or a phrase pattern (string). For example:
-        {'label': 'ORG', 'pattern': 'Apple'}
-        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
-
-        patterns (list): The patterns to add.
-
-        DOCS: https://spacy.io/api/entityruler#add_patterns
-        """
-
-        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
-        try:
-            current_index = -1
-            for i, (name, pipe) in enumerate(self.nlp.pipeline):
-                if self == pipe:
-                    current_index = i
-                    break
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
-        except ValueError:
-            subsequent_pipes = []
-        with self.nlp.select_pipes(disable=subsequent_pipes):
-            token_patterns = []
-            phrase_pattern_labels = []
-            phrase_pattern_texts = []
-            phrase_pattern_ids = []
-            for entry in patterns:
-                if isinstance(entry["pattern"], str):
-                    phrase_pattern_labels.append(entry["label"])
-                    phrase_pattern_texts.append(entry["pattern"])
-                    phrase_pattern_ids.append(entry.get("id"))
-                elif isinstance(entry["pattern"], list):
-                    token_patterns.append(entry)
-            phrase_patterns = []
-            for label, pattern, ent_id in zip(
-                phrase_pattern_labels,
-                self.nlp.pipe(phrase_pattern_texts),
-                phrase_pattern_ids,
-            ):
-                phrase_pattern = {"label": label, "pattern": pattern}
-                if ent_id:
-                    phrase_pattern["id"] = ent_id
-                phrase_patterns.append(phrase_pattern)
-            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
-                label = entry["label"]  # type: ignore
-                if "id" in entry:
-                    ent_label = label
-                    label = self._create_label(label, entry["id"])
-                    key = self.matcher._normalize_key(label)
-                    self._ent_ids[key] = (ent_label, entry["id"])
-                pattern = entry["pattern"]  # type: ignore
-                if isinstance(pattern, Doc):
-                    self.phrase_patterns[label].append(pattern)
-                    self.phrase_matcher.add(label, [pattern])  # type: ignore
-                elif isinstance(pattern, list):
-                    self.token_patterns[label].append(pattern)
-                    self.matcher.add(label, [pattern])
-                else:
-                    raise ValueError(Errors.E097.format(pattern=pattern))
-
-    def clear(self) -> None:
-        """Reset all patterns."""
-        self.token_patterns = defaultdict(list)
-        self.phrase_patterns = defaultdict(list)
-        self._ent_ids = defaultdict(tuple)
-        self.matcher = Matcher(
-            self.nlp.vocab,
-            validate=self._validate,
-            fuzzy_compare=self.matcher_fuzzy_compare,
-        )
-        self.phrase_matcher = PhraseMatcher(
-            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
-        )
-
-    def remove(self, ent_id: str) -> None:
-        """Remove a pattern by its ent_id if a pattern with this ent_id was added before
-
-        ent_id (str): id of the pattern to be removed
-        RETURNS: None
-        DOCS: https://spacy.io/api/entityruler#remove
-        """
-        label_id_pairs = [
-            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
-        ]
-        if not label_id_pairs:
-            raise ValueError(
-                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
-            )
-        created_labels = [
-            self._create_label(label, eid) for (label, eid) in label_id_pairs
-        ]
-        # remove the patterns from self.phrase_patterns
-        self.phrase_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.phrase_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        self.token_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.token_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        for label in created_labels:
-            if label in self.phrase_matcher:
-                self.phrase_matcher.remove(label)
-            else:
-                self.matcher.remove(label)
-
-    def _require_patterns(self) -> None:
-        """Raise a warning if this component has no patterns defined."""
-        if len(self) == 0:
-            warnings.warn(Warnings.W036.format(name=self.name))
-
-    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
-        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
-
-        label (str): The value of label in a pattern entry
-        RETURNS (tuple): ent_label, ent_id
-        """
-        if self.ent_id_sep in label:
-            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
-        else:
-            ent_label = label
-            ent_id = None  # type: ignore
-        return ent_label, ent_id
-
-    def _create_label(self, label: Any, ent_id: Any) -> str:
-        """Join Entity label with ent_id if the pattern has an `id` attribute
-        If ent_id is not a string, the label is returned as is.
-
-        label (str): The label to set for ent.label_
-        ent_id (str): The label
-        RETURNS (str): The ent_label joined with configured `ent_id_sep`
-        """
-        if isinstance(ent_id, str):
-            label = f"{label}{self.ent_id_sep}{ent_id}"
-        return label
-
-    def from_bytes(
-        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a bytestring.
-
-        patterns_bytes (bytes): The bytestring to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_bytes
-        """
-        cfg = srsly.msgpack_loads(patterns_bytes)
-        self.clear()
-        if isinstance(cfg, dict):
-            self.add_patterns(cfg.get("patterns", cfg))
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab,
-                attr=self.phrase_matcher_attr,
-            )
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-        else:
-            self.add_patterns(cfg)
-        return self
-
-    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
-        """Serialize the entity ruler patterns to a bytestring.
-
-        RETURNS (bytes): The serialized patterns.
-
-        DOCS: https://spacy.io/api/entityruler#to_bytes
-        """
-        serial = {
-            "overwrite": self.overwrite,
-            "ent_id_sep": self.ent_id_sep,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "patterns": self.patterns,
-        }
-        return srsly.msgpack_dumps(serial)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a file. Expects a file containing
-        newline-delimited JSON (JSONL) with one entry per line.
-
-        path (str / Path): The JSONL file to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_disk
-        """
-        path = ensure_path(path)
-        self.clear()
-        depr_patterns_path = path.with_suffix(".jsonl")
-        if path.suffix == ".jsonl":  # user provides a jsonl
-            if path.is_file:
-                patterns = srsly.read_jsonl(path)
-                self.add_patterns(patterns)
-            else:
-                raise ValueError(Errors.E1023.format(path=path))
-        elif depr_patterns_path.is_file():
-            patterns = srsly.read_jsonl(depr_patterns_path)
-            self.add_patterns(patterns)
-        elif path.is_dir():  # path is a valid directory
-            cfg = {}
-            deserializers_patterns = {
-                "patterns": lambda p: self.add_patterns(
-                    srsly.read_jsonl(p.with_suffix(".jsonl"))
-                )
-            }
-            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
-            from_disk(path, deserializers_cfg, {})
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab, attr=self.phrase_matcher_attr
-            )
-            from_disk(path, deserializers_patterns, {})
-        else:  # path is not a valid directory or file
-            raise ValueError(Errors.E146.format(path=path))
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Save the entity ruler patterns to a directory. The patterns will be
-        saved as newline-delimited JSON (JSONL).
-
-        path (str / Path): The JSONL file to save.
-
-        DOCS: https://spacy.io/api/entityruler#to_disk
-        """
-        path = ensure_path(path)
-        cfg = {
-            "overwrite": self.overwrite,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "ent_id_sep": self.ent_id_sep,
-        }
-        serializers = {
-            "patterns": lambda p: srsly.write_jsonl(
-                p.with_suffix(".jsonl"), self.patterns
-            ),
-            "cfg": lambda p: srsly.write_json(p, cfg),
-        }
-        if path.suffix == ".jsonl":  # user wants to save only JSONL
-            srsly.write_jsonl(path, self.patterns)
-        else:
-            to_disk(path, serializers, {})
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 2a5e2179a35..4875c5e4bff 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,6 +17,14 @@
 
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
+from ..language import Language
+from ..errors import Errors, Warnings
+from ..util import ensure_path, SimpleFrozenList, registry
+from ..tokens import Doc, Span
+from ..scorer import Scorer, get_ner_prf
+from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
@@ -33,7 +41,7 @@
 
 
 @Language.factory(
-    "future_entity_ruler",
+    "entity_ruler",
     assigns=["doc.ents"],
     default_config={
         "phrase_matcher_attr": None,
@@ -79,6 +87,15 @@ def make_entity_ruler(
     )
 
 
+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score
+
+
 @Language.factory(
     "span_ruler",
     assigns=["doc.spans"],
@@ -136,7 +153,7 @@ def prioritize_new_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by allowing
     spans to overwrite any entities that they overlap with. Intended to
-    replicate the overwrite_ents=True behavior from the EntityRuler.
+    replicate the overwrite_ents=True behavior from the v3 EntityRuler.
 
     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
@@ -167,7 +184,7 @@ def prioritize_existing_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by prioritizing
     existing entities. Intended to replicate the overwrite_ents=False behavior
-    from the EntityRuler.
+    from the v3 EntityRuler.
 
     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 4ad234cba3b..629f402f38e 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -87,14 +87,15 @@ def test_issue4373():
 
 @pytest.mark.issue(4651)
 def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
-    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    """Test that the entity_ruler PhraseMatcher is deserialized correctly using
+    the method from_disk when the entity_ruler argument phrase_matcher_attr is
     specified.
     """
     text = "Spacy is a python library for nlp"
     nlp = English()
     patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
-    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    config = {"phrase_matcher_attr": "LOWER"}
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp(text)
     res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
@@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr():
     with make_tempdir() as d:
         file_path = d / "entityruler"
         ruler.to_disk(file_path)
-        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+        nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path)
     doc_reloaded = nlp_reloaded(text)
     res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
     assert res == res_reloaded
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index d0ab003919e..9f5204006ec 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,6 +2,12 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, merge_entities
+from spacy.pipeline import SpanRuler
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
@@ -10,8 +16,6 @@
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc, Span
 
-ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"]
-
 
 @pytest.fixture
 def nlp():
@@ -38,13 +42,12 @@ def add_ent_component(doc):
 
 
 @pytest.mark.issue(3345)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue3345(entity_ruler_factory):
+def test_issue3345():
     """Test case where preset entity crosses sentence boundary."""
     nlp = English()
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
@@ -63,15 +66,14 @@ def test_issue3345(entity_ruler_factory):
 
 
 @pytest.mark.issue(4849)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue4849(entity_ruler_factory):
+def test_issue4849():
     nlp = English()
     patterns = [
         {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
         {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
     ]
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
+        "entity_ruler",
         name="entity_ruler",
         config={"phrase_matcher_attr": "LOWER"},
     )
@@ -94,11 +96,10 @@ def test_issue4849(entity_ruler_factory):
 
 
 @pytest.mark.issue(5918)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue5918(entity_ruler_factory):
+def test_issue5918():
     # Test edge case when merging entities.
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Digicon Inc"},
         {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
@@ -123,10 +124,9 @@ def test_issue5918(entity_ruler_factory):
 
 
 @pytest.mark.issue(8168)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue8168(entity_ruler_factory):
+def test_issue8168():
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Apple"},
         {
@@ -146,12 +146,9 @@ def test_issue8168(entity_ruler_factory):
 
 
 @pytest.mark.issue(8216)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_fix8216(nlp, patterns):
     """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"validate": True}
-    )
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
     assert pattern_count > 0
@@ -160,16 +157,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
     assert after_count == pattern_count
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_init(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     assert "HELLO" in ruler
     assert "BYE" in ruler
     nlp.remove_pipe("entity_ruler")
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     doc = nlp("hello world bye bye")
     assert len(doc.ents) == 2
@@ -177,23 +173,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
     nlp.remove_pipe("entity_ruler")
-    nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    nlp.add_pipe("entity_ruler")
     assert nlp.pipe_names == ["entity_ruler"]
     with pytest.warns(UserWarning):
         doc = nlp("hello world bye bye")
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
     assert len(ruler.labels) == 4
@@ -205,7 +199,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     nlp.config["initialize"]["components"]["entity_ruler"] = {
         "patterns": {"@misc": "entity_ruler_patterns"}
     }
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     nlp.initialize()
     assert len(ruler.labels) == 4
@@ -214,20 +208,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     ruler.initialize(lambda: [])
     assert len(ruler.labels) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     doc = nlp("hello world")
@@ -239,9 +231,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_existing(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -250,11 +241,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_overwrite(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -264,11 +252,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_complex(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("foo foo bye bye")
@@ -279,11 +264,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents[1]) == 2
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_entity_id(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
     assert len(doc.ents) == 1
@@ -291,26 +273,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
     assert doc.ents[0].ent_id_ == "a1"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_cfg_ent_id_sep(nlp, patterns):
     config = {"overwrite_ents": True, "ent_id_sep": "**"}
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config)
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
-    if isinstance(ruler, EntityRuler):
-        assert "TECH_ORG**a1" in ruler.phrase_patterns
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "TECH_ORG"
     assert doc.ents[0].ent_id_ == "a1"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns)
+def test_entity_ruler_serialize_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
     new_ruler = new_ruler.from_bytes(ruler_bytes)
@@ -322,28 +301,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
     assert sorted(new_ruler.labels) == sorted(ruler.labels)
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_phrase_matcher_attr_bytes(
-    nlp, patterns, entity_ruler_factory
-):
-    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns)
+def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"}
+    )
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
-    assert new_ruler.phrase_matcher_attr is None
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.phrase_matcher_attr == "LOWER"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_validate(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
-    validated_ruler = EntityRuler(nlp, validate=True)
+def test_entity_ruler_validate(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    validated_ruler = nlp.add_pipe(
+        "entity_ruler", name="validated_ruler", config={"validate": True}
+    )
 
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
@@ -360,16 +338,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory):
         validated_ruler.add_patterns([invalid_pattern])
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+def test_entity_ruler_properties(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
-    assert sorted(ruler.ent_ids) == ["a1", "a2"]
+    assert sorted(ruler.ids) == ["a1", "a2"]
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_overlapping_spans(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "FOOBAR", "pattern": "foo bar"},
         {"label": "BARBAZ", "pattern": "bar baz"},
@@ -418,14 +395,13 @@ def make_test_fuzzy_compare_disabled():
 
 
 @pytest.mark.parametrize("n_process", [1, 2])
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
+def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:
         texts = ["I enjoy eating Pizza Hut pizza."]
 
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
 
-        ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+        ruler = nlp.add_pipe("entity_ruler")
         ruler.add_patterns(patterns)
 
         for doc in nlp.pipe(texts, n_process=2):
@@ -433,9 +409,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
                 assert ent.ent_id_ == "1234"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler.jsonl")
@@ -444,9 +419,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
             ruler.from_disk(d / "non_existing.jsonl")  # read from a bad jsonl file
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_dir(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler")
@@ -455,9 +429,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
             ruler.from_disk(d / "non_existing_dir")  # read from a bad directory
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_basic(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -467,24 +440,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
     doc = nlp("Dina went to school")
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
     assert doc.ents[0].label_ == "PERSON"
     assert doc.ents[0].text == "Dina"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
     assert len(ruler.patterns) == 2
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "DinaCorp", "id": "dina"},
@@ -493,25 +458,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory
     ruler.add_patterns(patterns)
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
-        assert "ORG||dina" in ruler.phrase_matcher
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
-        assert "ORG||dina" not in ruler.phrase_matcher
     assert len(doc.ents) == 1
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -526,9 +481,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
             ruler.remove_by_id("nepattern")
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_several_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -542,27 +496,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "Dina"
     assert doc.ents[1].label_ == "ORG"
     assert doc.ents[1].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 2
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "ORG"
     assert doc.ents[0].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 1
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -578,21 +525,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert doc.ents[1].text == "ACME"
     assert doc.ents[2].label_ == "DATE"
     assert doc.ents[2].text == "her birthday"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-        ruler.remove("acme")
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("dina")
-        ruler.remove_by_id("acme")
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("dina")
+    ruler.remove_by_id("acme")
+    ruler.remove_by_id("bday")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_all_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -600,29 +541,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     ]
     ruler.add_patterns(patterns)
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     assert len(ruler.patterns) == 2
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("bday")
     assert len(ruler.patterns) == 0
     with pytest.warns(UserWarning):
         doc = nlp("Dina founded her company ACME on her birthday")
         assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_and_add(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "DATE", "pattern": "last time"}]
     ruler.add_patterns(patterns)
     doc = ruler(
@@ -643,10 +574,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "last time"
     assert doc.ents[1].label_ == "DATE"
     assert doc.ents[1].text == "this time"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc("I saw him last time we met, this time he brought some flowers")
     )
@@ -669,10 +597,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     )
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc(
             "I saw him last time we met, this time he brought some flowers, another time some chocolate."
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 6bbe743a12d..8170488f758 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,15 +8,9 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import (
-    DependencyParser,
-    EntityRecognizer,
-    EntityRuler,
-    SentenceRecognizer,
-    Tagger,
-    TextCategorizer,
-    TrainablePipe,
-)
+from spacy.pipeline import DependencyParser, EntityRecognizer
+from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
+from spacy.pipeline import TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
@@ -91,58 +85,17 @@ def test_issue_3526_1(en_vocab):
         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
     ]
     nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     ruler_bytes = ruler.to_bytes()
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
-    assert ruler.overwrite
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
+    )
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(ruler)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.overwrite == ruler.overwrite
-    assert new_ruler.ent_id_sep == ruler.ent_id_sep
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_2(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
-    new_ruler = EntityRuler(nlp)
-    new_ruler = new_ruler.from_bytes(bytes_old_style)
-    assert len(new_ruler) == len(ruler)
-    for pattern in ruler.patterns:
-        assert pattern in new_ruler.patterns
-    assert new_ruler.overwrite is not ruler.overwrite
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_3(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    with make_tempdir() as tmpdir:
-        out_file = tmpdir / "entity_ruler"
-        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
-        new_ruler = EntityRuler(nlp).from_disk(out_file)
-        for pattern in ruler.patterns:
-            assert pattern in new_ruler.patterns
-        assert len(new_ruler) == len(ruler)
-        assert new_ruler.overwrite is not ruler.overwrite
 
 
 @pytest.mark.issue(3526)
@@ -156,16 +109,14 @@ def test_issue_3526_4(en_vocab):
         nlp.to_disk(tmpdir)
         ruler = nlp.get_pipe("entity_ruler")
         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert ruler.overwrite is True
         nlp2 = load(tmpdir)
         new_ruler = nlp2.get_pipe("entity_ruler")
         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert new_ruler.overwrite is True
 
 
 @pytest.mark.issue(4042)
 def test_issue4042():
-    """Test that serialization of an EntityRuler before NER works fine."""
+    """Test that serialization of an entity_ruler before NER works fine."""
     nlp = English()
     # add ner pipe
     ner = nlp.add_pipe("ner")
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index a35b6e2566c..7976e7725e0 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,13 +1,24 @@
 ---
 title: EntityRuler
-tag: class
-source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
 ---
 
+<Infobox title="New in v4" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](#migrating) below for differences between the v3
+`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+See the [`SpanRuler`](/api/spanruler) API docs for the full API.
+
+</Infobox>
+
 The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
 token-based rules or exact phrase matches. It can be combined with the
 statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
@@ -64,273 +75,51 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-```python
-%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
-```
-
-## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
-
-Initialize the entity ruler. If patterns are supplied here, they need to be a
-list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
-be a token pattern (list) or a phrase pattern (string). For example:
-`{"label": "ORG", "pattern": "Apple"}`.
-
-> #### Example
->
-> ```python
-> # Construction via add_pipe
-> ruler = nlp.add_pipe("entity_ruler")
->
-> # Construction from class
-> from spacy.pipeline import EntityRuler
-> ruler = EntityRuler(nlp, overwrite_ents=True)
-> ```
-
-| Name                                                 | Description                                                                                                                                                                                                                           |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
-| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
-| _keyword-only_                                       |                                                                                                                                                                                                                                       |
-| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
-| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
-| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
-| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
-| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
-| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
-
-## EntityRuler.initialize {id="initialize",tag="method",version="3"}
-
-Initialize the component with data and used before training to load in rules
-from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
-method is typically called by [`Language.initialize`](/api/language#initialize)
-and lets you customize arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-> #### Example
->
-> ```python
-> entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.entity_ruler]
->
-> [initialize.components.entity_ruler.patterns]
-> @readers = "srsly.read_jsonl.v1"
-> path = "corpus/entity_ruler_patterns.jsonl
-> ```
-
-| Name           | Description                                                                                                                                                          |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                                                      |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
-
-## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
-
-The number of all patterns added to the entity ruler.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> assert len(ruler) == 0
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert len(ruler) == 1
-> ```
-
-| Name        | Description                     |
-| ----------- | ------------------------------- |
-| **RETURNS** | The number of patterns. ~~int~~ |
-
-## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
-
-Whether a label is present in the patterns.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert "ORG" in ruler
-> assert not "PERSON" in ruler
-> ```
-
-| Name        | Description                                           |
-| ----------- | ----------------------------------------------------- |
-| `label`     | The label to check. ~~str~~                           |
-| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
-
-## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
-
-Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
-happens automatically after the component has been added to the pipeline using
-[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
-with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occuring first in the Doc
-is chosen.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
->
-> doc = nlp("A text about Apple.")
-> ents = [(ent.text, ent.label_) for ent in doc.ents]
-> assert ents == [("Apple", "ORG")]
-> ```
-
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
-| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+## Migrating from v3 {#migrating}
 
-## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+### Loading patterns
 
-Add patterns to the entity ruler. A pattern can either be a token pattern (list
-of dicts) or a phrase pattern (string). For more details, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
+Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
+initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
+path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
+JSONL file separately and then added through
+[`SpanRuler.initialize`](/api/spanruler#initialize]) or
+[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
 
-> #### Example
->
-> ```python
-> patterns = [
->     {"label": "ORG", "pattern": "Apple"},
->     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
-> ]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ```
-
-| Name       | Description                                                      |
-| ---------- | ---------------------------------------------------------------- |
-| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
-
-## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
-
-Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
-the ID does not exist.
-
-> #### Example
->
-> ```python
-> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ruler.remove("apple")
-> ```
-
-| Name | Description                         |
-| ---- | ----------------------------------- |
-| `id` | The ID of the pattern rule. ~~str~~ |
-
-## EntityRuler.to_disk {id="to_disk",tag="method"}
-
-Save the entity ruler patterns to a directory. The patterns will be saved as
-newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
-only the patterns are saved as JSONL. If a directory name is provided, a
-`patterns.jsonl` and `cfg` file with the component configuration is exported.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
-> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
-> ```
-
-| Name   | Description                                                                                                                                              |
-| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-
-## EntityRuler.from_disk {id="from_disk",tag="method"}
-
-Load the entity ruler from a path. Expects either a file containing
-newline-delimited JSON (JSONL) with one entry per line, or a directory
-containing a `patterns.jsonl` file and a `cfg` file with the component
-configuration.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
-> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
-> ```
-
-| Name        | Description                                                                                                   |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
-
-## EntityRuler.to_bytes {id="to_bytes",tag="method"}
-
-Serialize the entity ruler patterns to a bytestring.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler_bytes = ruler.to_bytes()
-> ```
-
-| Name        | Description                        |
-| ----------- | ---------------------------------- |
-| **RETURNS** | The serialized patterns. ~~bytes~~ |
-
-## EntityRuler.from_bytes {id="from_bytes",tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> ruler_bytes = ruler.to_bytes()
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_bytes(ruler_bytes)
-> ```
-
-| Name         | Description                                        |
-| ------------ | -------------------------------------------------- |
-| `bytes_data` | The bytestring to load. ~~bytes~~                  |
-| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
-
-## EntityRuler.labels {id="labels",tag="property"}
-
-All labels present in the match patterns.
-
-| Name        | Description                            |
-| ----------- | -------------------------------------- |
-| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.from_disk("patterns.jsonl")
++ import srsly
++ patterns = srsly.read_jsonl("patterns.jsonl")
++ ruler.add_patterns(patterns)
+```
 
-## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+### Saving patterns
 
-All entity IDs present in the `id` properties of the match patterns.
+`SpanRuler.to_disk` always saves the full component data to a directory and does
+not include an option to save the patterns to a single JSONL file.
 
-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.to_disk("patterns.jsonl")
++ import srsly
++ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
+```
 
-## EntityRuler.patterns {id="patterns",tag="property"}
+### Accessing token and phrase patterns
 
-Get all patterns that were added to the entity ruler.
+The separate token patterns and phrase patterns are no longer accessible under
+`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
+patterns in their original format using the property
+[`SpanRuler.patterns`](/api/spanruler#patterns).
 
-| Name        | Description                                                                              |
-| ----------- | ---------------------------------------------------------------------------------------- |
-| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+### Removing patterns by ID
 
-## Attributes {id="attributes"}
+[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
+remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
 
-| Name              | Description                                                                                                           |
-| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
-| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
-| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.remove("id")
++ ruler.remove_by_id("id")
+```
diff --git a/website/docs/api/spanruler.mdx b/website/docs/api/spanruler.mdx
index 5889b1906ad..1b6c558acef 100644
--- a/website/docs/api/spanruler.mdx
+++ b/website/docs/api/spanruler.mdx
@@ -13,7 +13,18 @@ The span ruler lets you add spans to [`Doc.spans`](/api/doc#spans) and/or
 usage examples, see the docs on
 [rule-based span matching](/usage/rule-based-matching#spanruler).
 
-## Assigned Attributes {id="assigned-attributes"}
+<Infobox title="Replacement of the EntityRuler" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](/api/entityruler#migrating) for differences between
+the v3 `EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+</Infobox>
+
+## Assigned Attributes {#assigned-attributes}
 
 Matches will be saved to `Doc.spans[spans_key]` as a
 [`SpanGroup`](/api/spangroup) and/or to `Doc.ents`, where the annotation is
diff --git a/website/docs/usage/101/_architecture.mdx b/website/docs/usage/101/_architecture.mdx
index 2a63a3741fa..35c36088ab9 100644
--- a/website/docs/usage/101/_architecture.mdx
+++ b/website/docs/usage/101/_architecture.mdx
@@ -41,25 +41,27 @@ components for different language processing tasks and also allows adding
 
 ![The processing pipeline](/images/pipeline.svg)
 
-| Name                                            | Description                                                                                 |
-| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
-| [`AttributeRuler`](/api/attributeruler)         | Set token attributes using matcher rules.                                                   |
-| [`DependencyParser`](/api/dependencyparser)     | Predict syntactic dependencies.                                                             |
-| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words.                                                                |
-| [`EntityLinker`](/api/entitylinker)             | Disambiguate named entities to nodes in a knowledge base.                                   |
-| [`EntityRecognizer`](/api/entityrecognizer)     | Predict named entities, e.g. persons or products.                                           |
-| [`EntityRuler`](/api/entityruler)               | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
-| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words using rules and lookups.                                  |
-| [`Morphologizer`](/api/morphologizer)           | Predict morphological features and coarse-grained part-of-speech tags.                      |
-| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries.                                                                |
-| [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
-| [`Tagger`](/api/tagger)                         | Predict part-of-speech tags.                                                                |
-| [`TextCategorizer`](/api/textcategorizer)       | Predict categories or labels over the whole document.                                       |
-| [`Tok2Vec`](/api/tok2vec)                       | Apply a "token-to-vector" model and set its outputs.                                        |
-| [`Tokenizer`](/api/tokenizer)                   | Segment raw text and create `Doc` objects from the words.                                   |
-| [`TrainablePipe`](/api/pipe)                    | Class that all trainable pipeline components inherit from.                                  |
-| [`Transformer`](/api/transformer)               | Use a transformer model and set its outputs.                                                |
-| [Other functions](/api/pipeline-functions)      | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
+| Component name         | Component class                                      | Description                                                                                 |
+| ---------------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| `attribute_ruler`      | [`AttributeRuler`](/api/attributeruler)              | Set token attributes using matcher rules.                                                   |
+| `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Disambiguate named entities to nodes in a knowledge base.                                   |
+| `entity_ruler`         | [`SpanRuler`](/api/spanruler)                        | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
+| `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Determine the base forms of words using rules and lookups.                                  |
+| `morphologizer`        | [`Morphologizer`](/api/morphologizer)                | Predict morphological features and coarse-grained part-of-speech tags.                      |
+| `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Predict named entities, e.g. persons or products.                                           |
+| `parser`               | [`DependencyParser`](/api/dependencyparser)          | Predict syntactic dependencies.                                                             |
+| `senter`               | [`SentenceRecognizer`](/api/sentencerecognizer)      | Predict sentence boundaries.                                                                |
+| `sentencizer`          | [`Sentencizer`](/api/sentencizer)                    | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
+| `span_ruler`           | [`SpanRuler`](/api/spanruler)                        | Add spans to the `Doc` using token-based rules or exact phrase matches.                     |
+| `tagger`               | [`Tagger`](/api/tagger)                              | Predict part-of-speech tags.                                                                |
+| `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Predict exactly one category or label over a whole document.                                |
+| `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Predict 0, 1 or more categories or labels over a whole document.                            |
+| `tok2vec`              | [`Tok2Vec`](/api/tok2vec)                            | Apply a "token-to-vector" model and set its outputs.                                        |
+| `tokenizer`            | [`Tokenizer`](/api/tokenizer)                        | Segment raw text and create `Doc` objects from the words.                                   |
+| `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer)      | Predict base forms of words.                                                                |
+| `transformer`          | [`Transformer`](/api/transformer)                    | Use a transformer model and set its outputs.                                                |
+| -                      | [`TrainablePipe`](/api/pipe)                         | Class that all trainable pipeline components inherit from.                                  |
+| -                      | [Other functions](/api/pipeline-functions)           | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
 
 ### Matchers {id="architecture-matchers"}
 
diff --git a/website/docs/usage/101/_pipelines.mdx b/website/docs/usage/101/_pipelines.mdx
index 315291762ff..e5a08c5e424 100644
--- a/website/docs/usage/101/_pipelines.mdx
+++ b/website/docs/usage/101/_pipelines.mdx
@@ -51,9 +51,9 @@ example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
 only work if it's added after the tagger. The parser will respect pre-defined
 sentence boundaries, so if a previous component in the pipeline sets them, its
 dependency predictions may be different. Similarly, it matters if you add the
-[`EntityRuler`](/api/entityruler) before or after the statistical entity
-recognizer: if it's added before, the entity recognizer will take the existing
-entities into account when making predictions. The
+[`SpanRuler`](/api/spanruler) before or after the statistical entity recognizer:
+if it's added before and it is writing to `doc.ents`, then the entity recognizer
+will take those existing entities into account when making predictions. The
 [`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge
 base IDs, should be preceded by a pipeline component that recognizes entities
 such as the [`EntityRecognizer`](/api/entityrecognizer).
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index 3e58b251dec..ec93aee2cf3 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -297,13 +297,14 @@ available pipeline components and component functions.
 > ruler = nlp.add_pipe("entity_ruler")
 > ```
 
-| String name            | Component                                            | Description                                                                               |
+| Component name         | Component class                                      | Description                                                                               |
 | ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- |
 | `tagger`               | [`Tagger`](/api/tagger)                              | Assign part-of-speech-tags.                                                               |
 | `parser`               | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                 |
 | `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                    |
 | `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
-| `entity_ruler`         | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                            |
+| `span_ruler`           | [`SpanRuler`](/api/spanruler)                        | Assign spans based on pattern rules and dictionaries.                                     |
+| `entity_ruler`         | [`SpanRuler`](/api/spanruler)                        | Assign named entities based on pattern rules and dictionaries.                            |
 | `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                   |
 | `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.   |
 | `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words using rules and lookups.                                       |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index c90172b4325..86220440991 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -511,7 +511,7 @@ matches = matcher(doc)
 ```
 
 A very similar logic has been implemented in the built-in
-[`EntityRuler`](/api/entityruler) by the way. It also takes care of handling
+[`entity_ruler`](/api/entityruler) by the way. It also takes care of handling
 overlapping matches, which you would otherwise have to take care of yourself.
 
 > #### Tip: Visualizing matches
@@ -1305,7 +1305,7 @@ of patterns such as `{}` that match any token in the sentence.
 
 ## Rule-based entity recognition {id="entityruler",version="2.1"}
 
-The [`EntityRuler`](/api/entityruler) is a component that lets you add named
+The [`entity_ruler`](/api/entityruler) is a component that lets you add named
 entities based on pattern dictionaries, which makes it easy to combine
 rule-based and statistical named entity recognition for even more powerful
 pipelines.
@@ -1330,13 +1330,12 @@ pattern. The entity ruler accepts two types of patterns:
 
 ### Using the entity ruler {id="entityruler-usage"}
 
-The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically
-added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
-called on a text, it will find matches in the `doc` and add them as entities to
-the `doc.ents`, using the specified pattern label as the entity label. If any
-matches were to overlap, the pattern matching most tokens takes priority. If
-they also happen to be equally long, then the match occurring first in the `Doc`
-is chosen.
+The `entity_ruler` is a pipeline component that's typically added via
+[`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a
+text, it will find matches in the `doc` and add them as entities to `doc.ents`,
+using the specified pattern label as the entity label. If any matches were to
+overlap, the pattern matching most tokens takes priority. If they also happen to
+be equally long, then the match occurring first in the `Doc` is chosen.
 
 ```python {executable="true"}
 from spacy.lang.en import English
@@ -1372,7 +1371,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```
 
-#### Validating and debugging EntityRuler patterns {id="entityruler-pattern-validation",version="2.1.8"}
+#### Validating and debugging entity ruler patterns {#entityruler-pattern-validation new="2.1.8"}
 
 The entity ruler can validate patterns against a JSON schema with the config
 setting `"validate"`. See details under
@@ -1384,9 +1383,9 @@ ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
 
 ### Adding IDs to patterns {id="entityruler-ent-ids",version="2.2.2"}
 
-The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each
-pattern. Using the `id` attribute allows multiple patterns to be associated with
-the same entity.
+The [`entity_ruler`](/api/entityruler) can also accept an `id` attribute for
+each pattern. Using the `id` attribute allows multiple patterns to be associated
+with the same entity.
 
 ```python {executable="true"}
 from spacy.lang.en import English
@@ -1405,10 +1404,10 @@ doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given
-in the patterns. So in the example above it's easy to identify that "San
-Francisco" and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given in
+the patterns. So in the example above it's easy to identify that "San Francisco"
+and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 
@@ -1431,13 +1430,13 @@ new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl")
 
 If you're using the [Prodigy](https://prodi.gy) annotation tool, you might
 recognize these pattern files from bootstrapping your named entity and text
-classification labelling. The patterns for the `EntityRuler` follow the same
+classification labelling. The patterns for the `entity_ruler` follow the same
 syntax, so you can use your existing Prodigy pattern files in spaCy, and vice
 versa.
 
 </Infobox>
 
-When you save out an `nlp` object that has an `EntityRuler` added to its
+When you save out an `nlp` object that has an `entity_ruler` added to its
 pipeline, its patterns are automatically exported to the pipeline directory:
 
 ```python
@@ -1460,9 +1459,9 @@ rules included!
 
 When using a large amount of **phrase patterns** (roughly > 10000) it's useful
 to understand how the `add_patterns` function of the entity ruler works. For
-each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc
-object. This happens in case you try to add the EntityRuler at the end of an
-existing pipeline with, for example, a POS tagger and want to extract matches
+each **phrase pattern**, the entity ruler calls the nlp object to construct a
+doc object. This happens in case you try to add the entity ruler at the end of
+an existing pipeline with, for example, a POS tagger and want to extract matches
 based on the pattern's POS signature. In this case you would pass a config value
 of `"phrase_matcher_attr": "POS"` for the entity ruler.
 
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index b44bd86ed06..97ae3c5e573 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,13 +187,13 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the new
-> [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the
+> [`SpanRuler`](/api/spanruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
-> saving out a pipeline with a rule-based entity recognizer and including all
-> rules _with_ the component data.
+> saving out a pipeline with rule-based components _with_ all the component
+> data.
 
 ```python {highlight="16-23,25-30"}
 import json
diff --git a/website/docs/usage/training.mdx b/website/docs/usage/training.mdx
index abb1b9cfd91..eda3f355f1a 100644
--- a/website/docs/usage/training.mdx
+++ b/website/docs/usage/training.mdx
@@ -421,7 +421,7 @@ your components during training, and the most common scenarios are:
 2. Update an existing **trained component** with more examples.
 3. Include an existing trained component without updating it.
 4. Include a non-trainable component, like a rule-based
-   [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a
+   [`SpanRuler`](/api/spanruler) or [`Sentencizer`](/api/sentencizer), or a
    fully [custom component](/usage/processing-pipelines#custom-components).
 
 If a component block defines a `factory`, spaCy will look it up in the

From 932f53a733f860ccf784dc7589bae5d1b9ff002e Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Wed, 23 Nov 2022 13:09:32 +0100
Subject: [PATCH 021/504] Remove sentiment extension (#11722)

* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
---
 spacy/lexeme.pyi                            |   1 -
 spacy/lexeme.pyx                            |  13 --
 spacy/tests/README.md                       |   2 +-
 spacy/tests/doc/test_doc_api.py             |  13 +-
 spacy/tests/doc/test_span.py                |  25 ---
 spacy/tests/matcher/test_matcher_api.py     |   3 -
 spacy/tokens/doc.pxd                        |   2 -
 spacy/tokens/doc.pyi                        |   1 -
 spacy/tokens/doc.pyx                        |   5 -
 spacy/tokens/span.pyi                       |   2 -
 spacy/tokens/span.pyx                       |  10 --
 spacy/tokens/token.pyi                      |   2 -
 spacy/tokens/token.pyx                      |   8 -
 website/docs/api/doc.mdx                    |   2 -
 website/docs/api/lexeme.md                  | 163 ++++++++++++++++++++
 website/docs/api/span.mdx                   |   1 -
 website/docs/api/token.mdx                  |   1 -
 website/docs/usage/processing-pipelines.mdx |   2 +-
 website/docs/usage/rule-based-matching.mdx  |  16 +-
 19 files changed, 185 insertions(+), 87 deletions(-)
 create mode 100644 website/docs/api/lexeme.md

diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 9980b9fcefa..fb937d7b998 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -19,7 +19,6 @@ class Lexeme:
     def vector_norm(self) -> float: ...
     vector: Floats1d
     rank: int
-    sentiment: float
     @property
     def orth_(self) -> str: ...
     @property
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index f803d5e9394..3e63afa34ba 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -191,19 +191,6 @@ cdef class Lexeme:
         def __set__(self, value):
             self.c.id = value
 
-    property sentiment:
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the lexeme."""
-        def __get__(self):
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
-            return sentiment_table.get(self.c.orth, 0.0)
-
-        def __set__(self, float x):
-            if "lexeme_sentiment" not in self.vocab.lookups:
-                self.vocab.lookups.add_table("lexeme_sentiment")
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
-            sentiment_table[self.c.orth] = x
-
     @property
     def orth_(self):
         """RETURNS (str): The original verbatim text of the lexeme
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index 82fabcc778b..f3c96a39e7c 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 
 To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
 
-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
 - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
 - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 73544c51a4f..946910b29e1 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -389,9 +389,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 
-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@@ -999,3 +997,12 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key2"]) == 1
     doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
     assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index ab8538b17dc..74874624888 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -306,31 +306,6 @@ def test_span_similarity_match():
         assert span1[:1].similarity(doc.vocab["a"]) == 1.0
 
 
-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
 def test_spans_are_hashable(en_tokenizer):
     """Test spans can be hashed."""
     text = "good stuff bad stuff"
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 106a00b3011..ecb4385dd90 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -51,8 +51,6 @@ def test_matcher_from_usage_docs(en_vocab):
 
     def label_sentiment(matcher, doc, i, matches):
         match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
         span = doc[start:end]
         with doc.retokenize() as retokenizer:
             retokenizer.merge(span)
@@ -62,7 +60,6 @@ def label_sentiment(matcher, doc, i, matches):
     matcher = Matcher(en_vocab)
     matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
     matcher(doc)
-    assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"
 
 
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 5e8975ed337..9fb6a72c87f 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -48,8 +48,6 @@ cdef class Doc:
 
     cdef TokenC* c
 
-    cdef public float sentiment
-
     cdef public dict activations
 
     cdef public dict user_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 5fda6f2f789..97c3f69f430 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -40,7 +40,6 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
-    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 497656b6570..48def8c9544 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -279,7 +279,6 @@ cdef class Doc:
         self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
-        self.sentiment = 0.0
         self.cats = {}
         self.activations = {}
         self.user_hooks = {}
@@ -1315,7 +1314,6 @@ cdef class Doc:
         other.tensor = copy.deepcopy(self.tensor)
         other.cats = copy.deepcopy(self.cats)
         other.user_data = copy.deepcopy(self.user_data)
-        other.sentiment = self.sentiment
         other.has_unknown_spaces = self.has_unknown_spaces
         other.user_hooks = dict(self.user_hooks)
         other.user_token_hooks = dict(self.user_token_hooks)
@@ -1415,7 +1413,6 @@ cdef class Doc:
             "text": lambda: self.text,
             "array_head": lambda: array_head,
             "array_body": lambda: self.to_array(array_head),
-            "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
             "spans": lambda: self.spans.to_bytes(),
@@ -1451,8 +1448,6 @@ cdef class Doc:
             for key, value in zip(user_data_keys, user_data_values):
                 self.user_data[key] = value
         cdef int i, start, end, has_space
-        if "sentiment" not in exclude and "sentiment" in msg:
-            self.sentiment = msg["sentiment"]
         if "tensor" not in exclude and "tensor" in msg:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index a6731d1c2d4..ae4a6209e7e 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -89,8 +89,6 @@ class Span:
     @property
     def tensor(self) -> FloatsXd: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index b212b4c4303..73f555747eb 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -594,16 +594,6 @@ cdef class Span:
             return None
         return self.doc.tensor[self.start : self.end]
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the span.
-        """
-        if "sentiment" in self.doc.user_span_hooks:
-            return self.doc.user_span_hooks["sentiment"](self)
-        else:
-            return sum([token.sentiment for token in self]) / len(self)
-
     @property
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""
diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi
index 435ace52707..5c3d4d0ba2b 100644
--- a/spacy/tokens/token.pyi
+++ b/spacy/tokens/token.pyi
@@ -78,8 +78,6 @@ class Token:
     @property
     def prob(self) -> float: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def lang(self) -> int: ...
     @property
     def idx(self) -> int: ...
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index c0cd0af42c0..3a7ce45c54a 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -309,14 +309,6 @@ cdef class Token:
         """RETURNS (float): Smoothed log probability estimate of token type."""
         return self.vocab[self.c.lex.orth].prob
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the token."""
-        if "sentiment" in self.doc.user_token_hooks:
-            return self.doc.user_token_hooks["sentiment"](self)
-        return self.vocab[self.c.lex.orth].sentiment
-
     @property
     def lang(self):
         """RETURNS (uint64): ID of the language of the parent document's
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 310ce0dc88d..28757cbc45f 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -762,7 +762,6 @@ The L2 norm of the document's vector representation.
 | `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
 | `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
 | `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
-| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
 | `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
 | `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
 | `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
@@ -786,7 +785,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name               | Description                                   |
 | ------------------ | --------------------------------------------- |
 | `text`             | The value of the `Doc.text` attribute.        |
-| `sentiment`        | The value of the `Doc.sentiment` attribute.   |
 | `tensor`           | The value of the `Doc.tensor` attribute.      |
 | `user_data`        | The value of the `Doc.user_data` dictionary.  |
 | `user_data_keys`   | The keys of the `Doc.user_data` dictionary.   |
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
new file mode 100644
index 00000000000..db1aba7aaec
--- /dev/null
+++ b/website/docs/api/lexeme.md
@@ -0,0 +1,163 @@
+---
+title: Lexeme
+teaser: An entry in the vocabulary
+tag: class
+source: spacy/lexeme.pyx
+---
+
+A `Lexeme` has no string context – it's a word type, as opposed to a word token.
+It therefore has no part-of-speech tag, dependency parse, or lemma (if
+lemmatization depends on the part-of-speech tag).
+
+## Lexeme.\_\_init\_\_ {#init tag="method"}
+
+Create a `Lexeme` object.
+
+| Name    | Description                        |
+| ------- | ---------------------------------- |
+| `vocab` | The parent vocabulary. ~~Vocab~~   |
+| `orth`  | The orth id of the lexeme. ~~int~~ |
+
+## Lexeme.set_flag {#set_flag tag="method"}
+
+Change the value of a boolean flag.
+
+> #### Example
+>
+> ```python
+> COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
+> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
+> ```
+
+| Name      | Description                                  |
+| --------- | -------------------------------------------- |
+| `flag_id` | The attribute ID of the flag to set. ~~int~~ |
+| `value`   | The new value of the flag. ~~bool~~          |
+
+## Lexeme.check_flag {#check_flag tag="method"}
+
+Check the value of a boolean flag.
+
+> #### Example
+>
+> ```python
+> is_my_library = lambda text: text in ["spaCy", "Thinc"]
+> MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
+> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
+> ```
+
+| Name        | Description                                    |
+| ----------- | ---------------------------------------------- |
+| `flag_id`   | The attribute ID of the flag to query. ~~int~~ |
+| **RETURNS** | The value of the flag. ~~bool~~                |
+
+## Lexeme.similarity {#similarity tag="method" model="vectors"}
+
+Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> orange = nlp.vocab["orange"]
+> apple_orange = apple.similarity(orange)
+> orange_apple = orange.similarity(apple)
+> assert apple_orange == orange_apple
+> ```
+
+| Name        | Description                                                                                                                      |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| other       | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
+| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~                                                                     |
+
+## Lexeme.has_vector {#has_vector tag="property" model="vectors"}
+
+A boolean value indicating whether a word vector is associated with the lexeme.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> assert apple.has_vector
+> ```
+
+| Name        | Description                                             |
+| ----------- | ------------------------------------------------------- |
+| **RETURNS** | Whether the lexeme has a vector data attached. ~~bool~~ |
+
+## Lexeme.vector {#vector tag="property" model="vectors"}
+
+A real-valued meaning representation.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> assert apple.vector.dtype == "float32"
+> assert apple.vector.shape == (300,)
+> ```
+
+| Name        | Description                                                                                      |
+| ----------- | ------------------------------------------------------------------------------------------------ |
+| **RETURNS** | A 1-dimensional array representing the lexeme's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+
+## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"}
+
+The L2 norm of the lexeme's vector representation.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> pasta = nlp.vocab["pasta"]
+> apple.vector_norm  # 7.1346845626831055
+> pasta.vector_norm  # 7.759851932525635
+> assert apple.vector_norm != pasta.vector_norm
+> ```
+
+| Name        | Description                                         |
+| ----------- | --------------------------------------------------- |
+| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
+
+## Attributes {#attributes}
+
+| Name                                         | Description                                                                                                                                                                                                                                                          |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                      | The lexeme's vocabulary. ~~Vocab~~                                                                                                                                                                                                                                   |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
+| `orth_`                                      | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                 |
+| `rank`                                       | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                |
+| `flags`                                      | Container of the lexeme's binary flags. ~~int~~                                                                                                                                                                                                                      |
+| `norm`                                       | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~                                                                                                                                                                                                |
+| `norm_`                                      | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~                                                                                                                                                                                                |
+| `lower`                                      | Lowercase form of the word. ~~int~~                                                                                                                                                                                                                                  |
+| `lower_`                                     | Lowercase form of the word. ~~str~~                                                                                                                                                                                                                                  |
+| `shape`                                      | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `prefix`                                     | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~                                                                                                                                                                                            |
+| `prefix_`                                    | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~                                                                                                                                                                                            |
+| `suffix`                                     | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~                                                                                                                                                                                              |
+| `suffix_`                                    | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
+| `is_alpha`                                   | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~                                                                                                                                                                    |
+| `is_ascii`                                   | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~                                                                                                                                                     |
+| `is_digit`                                   | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~                                                                                                                                                                                   |
+| `is_lower`                                   | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. ~~bool~~                                                                                                                                                                                          |
+| `is_upper`                                   | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. ~~bool~~                                                                                                                                                                                          |
+| `is_title`                                   | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. ~~bool~~                                                                                                                                                                                          |
+| `is_punct`                                   | Is the lexeme punctuation? ~~bool~~                                                                                                                                                                                                                                  |
+| `is_left_punct`                              | Is the lexeme a left punctuation mark, e.g. `(`? ~~bool~~                                                                                                                                                                                                            |
+| `is_right_punct`                             | Is the lexeme a right punctuation mark, e.g. `)`? ~~bool~~                                                                                                                                                                                                           |
+| `is_space`                                   | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. ~~bool~~                                                                                                                                                                    |
+| `is_bracket`                                 | Is the lexeme a bracket? ~~bool~~                                                                                                                                                                                                                                    |
+| `is_quote`                                   | Is the lexeme a quotation mark? ~~bool~~                                                                                                                                                                                                                             |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the lexeme a currency symbol? ~~bool~~                                                                                                                                                                                                                            |
+| `like_url`                                   | Does the lexeme resemble a URL? ~~bool~~                                                                                                                                                                                                                             |
+| `like_num`                                   | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                          |
+| `like_email`                                 | Does the lexeme resemble an email address? ~~bool~~                                                                                                                                                                                                                  |
+| `is_oov`                                     | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                      |
+| `is_stop`                                    | Is the lexeme part of a "stop list"? ~~bool~~                                                                                                                                                                                                                        |
+| `lang`                                       | Language of the parent vocabulary. ~~int~~                                                                                                                                                                                                                           |
+| `lang_`                                      | Language of the parent vocabulary. ~~str~~                                                                                                                                                                                                                           |
+| `prob`                                       | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                 |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 5e7495f17ca..1774a298ff2 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -568,5 +568,4 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 12b99394350..16d421c12f4 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index ec93aee2cf3..c0fc4207046 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1388,7 +1388,7 @@ separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
 immediately know what's built-in and what's custom – for example,
-`doc.sentiment` is spaCy, while `doc._.sent_score` isn't.
+`doc.lang` is spaCy, while `doc._.language` isn't.
 
 </Accordion>
 
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 86220440991..8469d587ed1 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -811,6 +811,9 @@ whitespace, making them easy to match as well.
 ```python {executable="true"}
 from spacy.lang.en import English
 from spacy.matcher import Matcher
+from spacy.tokens import Doc
+
+Doc.set_extension("sentiment", default=0.0)
 
 nlp = English()  # We only want the tokenizer, so no need to load a pipeline
 matcher = Matcher(nlp.vocab)
@@ -826,9 +829,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
 matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
 matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
@@ -857,17 +860,18 @@ is "Smiling Face With Heart-Eyes". Assigning it to a
 the emoji span will make it available as `span._.emoji_desc`.
 
 ```python
-import emoji  # Installation: pip install emoji
-from spacy.tokens import Span  # Get the global Span object
+from emojipedia import Emojipedia  # Installation: pip install emojipedia
+from spacy.tokens import Doc, Span  # Get the global Doc and Span object
 
 Span.set_extension("emoji_desc", default=None)  # Register the custom attribute
+Doc.set_extension("sentiment", default=0.0)
 
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
     span = doc[start:end]
     # Verify if it is an emoji and set the extension attribute correctly.
     if emoji.is_emoji(span[0].text):

From f5d57892656c9381a4b0e720ce4fea0cc0a4cf9f Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 5 Dec 2022 08:57:24 +0100
Subject: [PATCH 022/504] prettier formatting

---
 website/docs/api/cli.mdx                    | 30 ++++++++++-----------
 website/docs/usage/processing-pipelines.mdx |  4 +--
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 950d98c1f68..47028f4a2e7 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1343,21 +1343,21 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                     | Description                                                                                                                                                                          |
-| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                     |
-| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-
-## assemble {id="assemble",tag="command"}
+| Name                    | Description                                                                                                                                                                          |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
+| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
+| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
+| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
+| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
+| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
+| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
+| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+
+## assemble {#assemble tag="command"}
 
 Assemble a pipeline from a config file without additional training. Expects a
 [config file](/api/data-formats#config) with all settings and hyperparameters.
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index c0fc4207046..fb5de5da102 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1387,8 +1387,8 @@ Writing to a `._` attribute instead of to the `Doc` directly keeps a clearer
 separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
-immediately know what's built-in and what's custom – for example,
-`doc.lang` is spaCy, while `doc._.language` isn't.
+immediately know what's built-in and what's custom – for example, `doc.lang` is
+spaCy, while `doc._.language` isn't.
 
 </Accordion>
 

From 72d0f069f0a77c8211d8ff5319ad2220e725d7e5 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 5 Dec 2022 17:43:23 +0900
Subject: [PATCH 023/504] Switch ubuntu-latest to ubuntu-20.04 in main tests
 (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6
---
 azure-pipelines.yml | 103 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 azure-pipelines.yml

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 00000000000..0f7ea91f96f
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,103 @@
+trigger:
+  batch: true
+  branches:
+    include:
+      - "*"
+    exclude:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+  paths:
+    exclude:
+      - "website/*"
+      - "*.md"
+      - ".github/workflows/*"
+pr:
+  paths:
+    exclude:
+      - "*.md"
+      - "website/docs/*"
+      - "website/src/*"
+      - ".github/workflows/*"
+
+jobs:
+  # Perform basic checks for most important errors (syntax etc.) Uses the config
+  # defined in .flake8 and overwrites the selected codes.
+  - job: "Validate"
+    pool:
+      vmImage: "ubuntu-latest"
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "3.7"
+      - script: |
+          pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+        displayName: "flake8"
+
+  - job: "Test"
+    dependsOn: "Validate"
+    strategy:
+      matrix:
+        # We're only running one platform per Python version to speed up builds
+        Python36Linux:
+          imageName: "ubuntu-20.04"
+          python.version: "3.6"
+        #        Python36Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.6"
+        #        Python36Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.6"
+        #        Python37Linux:
+        #          imageName: "ubuntu-20.04"
+        #          python.version: "3.7"
+        Python37Windows:
+          imageName: "windows-latest"
+          python.version: "3.7"
+        #        Python37Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.7"
+        #        Python38Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.8"
+        #        Python38Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.8"
+        Python38Mac:
+          imageName: "macos-latest"
+          python.version: "3.8"
+        Python39Linux:
+          imageName: "ubuntu-latest"
+          python.version: "3.9"
+        #        Python39Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.9"
+        #        Python39Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.9"
+        #        Python310Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.10"
+        Python310Windows:
+          imageName: "windows-latest"
+          python.version: "3.10"
+        #        Python310Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11'
+      maxParallel: 4
+    pool:
+      vmImage: $(imageName)
+    steps:
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'

From f84e471d226c7958bb95d5611caecf390554e4b2 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:43:52 +0900
Subject: [PATCH 024/504] Remove all references to "begin_training" (#11943)

When v3 was released, `begin_training` was renamed to `initialize`.
There were warnings in the code and docs about that. This PR removes
them.
---
 spacy/errors.py                           |  7 -------
 spacy/language.py                         |  9 ---------
 spacy/pipeline/pipe.pyx                   |  7 -------
 spacy/tests/pipeline/test_pipe_methods.py | 11 -----------
 website/docs/api/dependencyparser.mdx     |  6 ------
 website/docs/api/entitylinker.mdx         |  6 ------
 website/docs/api/entityrecognizer.mdx     |  6 ------
 website/docs/api/language.mdx             |  9 ---------
 website/docs/api/pipe.mdx                 |  6 ------
 website/docs/api/tagger.mdx               |  6 ------
 website/docs/api/textcategorizer.mdx      |  6 ------
 11 files changed, 79 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 965c92066bc..454e71f987c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -132,13 +132,6 @@ class Warnings(metaclass=ErrorsWithCodes):
             "and make it independent. For example, `replace_listeners = "
             "[\"model.tok2vec\"]` See the documentation for details: "
             "https://spacy.io/usage/training#config-components-listeners")
-    W088 = ("The pipeline component {name} implements a `begin_training` "
-            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
-            "has been renamed to `initialize`, so you likely want to rename the "
-            "component method. See the documentation for details: "
-            "https://spacy.io/api/language#initialize")
-    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
-            "to `nlp.initialize`.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
diff --git a/spacy/language.py b/spacy/language.py
index 18d20c93932..a47cc5df454 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1277,15 +1277,6 @@ def get_grads(key, W, dW):
             sgd(key, W, dW)  # type: ignore[call-arg, misc]
         return losses
 
-    def begin_training(
-        self,
-        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
-        *,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
-        warnings.warn(Warnings.W089, DeprecationWarning)
-        return self.initialize(get_examples, sgd=sgd)
-
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 72ea7e45a80..ea5fc5253d9 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -21,13 +21,6 @@ cdef class Pipe:
     DOCS: https://spacy.io/api/pipe
     """
 
-    @classmethod
-    def __init_subclass__(cls, **kwargs):
-        """Raise a warning if an inheriting class implements 'begin_training'
-         (from v2) instead of the new 'initialize' method (from v3)"""
-        if hasattr(cls, "begin_training"):
-            warnings.warn(Warnings.W088.format(name=cls.__name__))
-
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 4dd7bae16c2..9b9786f0458 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -529,17 +529,6 @@ def test_pipe_label_data_no_labels(pipe):
         assert "labels" not in get_arg_names(initialize)
 
 
-def test_warning_pipe_begin_training():
-    with pytest.warns(UserWarning, match="begin_training"):
-
-        class IncompatPipe(TrainablePipe):
-            def __init__(self):
-                ...
-
-            def begin_training(*args, **kwargs):
-                ...
-
-
 def test_pipe_methods_initialize():
     """Test that the [initialize] config reflects the components correctly."""
     nlp = Language()
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index a6bc48cdf74..771a00aeee1 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -169,12 +169,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 85b872151fd..238b62a2e6d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -200,12 +200,6 @@ knowledge base. This argument should be a function that takes a `Vocab` instance
 and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
 are synced with the current vocab.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index c80406a5b81..1f386bbb6ff 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -165,12 +165,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 068e8ea7885..d5fbae05ec4 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -259,15 +259,6 @@ either in the [config](/usage/training#config), or by calling
 [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
 the tagger or textcat).
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`. It now also takes a
-**function** that is called with no arguments and returns a sequence of
-[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
-objects.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index c2777edf07e..b387ea58654 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -152,12 +152,6 @@ network,
 setting up the label scheme based on the data. This method is typically called
 by [`Language.initialize`](/api/language#initialize).
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 20852e8eb94..ae14df212ee 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -142,12 +142,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a1dfb6dd88e..5db3a409255 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -187,12 +187,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python

From 52e67529e09f97ec0deb06540c5c1f2c1091199d Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:45:52 +0900
Subject: [PATCH 025/504] Remove old model shortcuts (#11916)

* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/download.py         | 18 ++----------------
 spacy/errors.py               | 16 ----------------
 spacy/util.py                 |  4 +---
 website/UNIVERSE.md           |  2 +-
 website/docs/usage/models.mdx | 29 +----------------------------
 website/meta/universe.json    | 20 ++++++++++++--------
 6 files changed, 17 insertions(+), 72 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 4261fb830d9..f371d110319 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,16 +7,8 @@
 from wasabi import msg
 
 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
-    get_minor_version,
-    is_in_interactive,
-    is_in_jupyter,
-    is_package,
-    is_prerelease_version,
-    run_command,
-)
-from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
+from ..util import is_package, get_minor_version, run_command
+from ..util import is_prerelease_version
 
 
 @app.command(
@@ -76,12 +68,6 @@ def download(
         version = components[-1]
     else:
         model_name = model
-        if model in OLD_MODEL_SHORTCUTS:
-            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
-                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
-            )
-            model_name = OLD_MODEL_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
 
diff --git a/spacy/errors.py b/spacy/errors.py
index 454e71f987c..5f03d0eae94 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -732,13 +732,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "method in component '{name}'. If you want to use this "
             "method, make sure it's overwritten on the subclass.")
     E940 = ("Found NaN values in scores.")
-    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
-            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
-            "load the model, use its full name instead:\n\n"
-            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
-            "models, see the models directory: https://spacy.io/models and if "
-            "you want to create a blank model, use spacy.blank: "
-            "nlp = spacy.blank(\"{name}\")")
     E942 = ("Executing `after_{name}` callback failed. Expected the function to "
             "return an initialized nlp object but got: {value}. Maybe "
             "you forgot to return the modified object in your function?")
@@ -986,15 +979,6 @@ class Errors(metaclass=ErrorsWithCodes):
              "but got '{received_type}'")
 
 
-# Deprecated model shortcuts, only used in errors and warnings
-OLD_MODEL_SHORTCUTS = {
-    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
-    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
-    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
-    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
-}
-
-
 # fmt: on
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 8068c4bcec9..463ac219bf5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -66,7 +66,7 @@
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
+from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
 from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
@@ -465,8 +465,6 @@ def load_model(
             return load_model_from_path(Path(name), **kwargs)  # type: ignore[arg-type]
     elif hasattr(name, "exists"):  # Path or Path-like to model data
         return load_model_from_path(name, **kwargs)  # type: ignore[arg-type]
-    if name in OLD_MODEL_SHORTCUTS:
-        raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))  # type: ignore[index]
     raise IOError(Errors.E050.format(name=name))
 
 
diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md
index ac4e2e684fb..a9008086c95 100644
--- a/website/UNIVERSE.md
+++ b/website/UNIVERSE.md
@@ -61,7 +61,7 @@ use a linter to verify that your markup is correct.
         "import spacy",
         "import package_name",
         "",
-        "nlp = spacy.load('en')",
+        "nlp = spacy.load('en_core_web_sm')",
         "nlp.add_pipe(package_name)"
     ],
     "code_language": "python",
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 9213dead16b..e74c37e3080 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -337,23 +337,7 @@ The easiest way to download a trained pipeline is via spaCy's
 [`download`](/api/cli#download) command. It takes care of finding the
 best-matching package compatible with your spaCy installation.
 
-> #### Important note for v3.0
->
-> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed pipeline package, use its full name:
->
-> ```diff
-> - python -m spacy download en
-> + python -m spacy download en_core_web_sm
-> ```
->
-> ```diff
-> - nlp = spacy.load("en")
-> + nlp = spacy.load("en_core_web_sm")
-> ```
-
-```bash
+```cli
 # Download best-matching version of a package for your spaCy installation
 $ python -m spacy download en_core_web_sm
 
@@ -483,17 +467,6 @@ spacy.cli.download("en_core_web_sm")
 To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with
 the package name or a path to the data directory:
 
-> #### Important note for v3.0
->
-> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed pipeline package, use its full name:
->
-> ```diff
-> - python -m spacy download en
-> + python -m spacy download en_core_web_sm
-> ```
-
 ```python
 import spacy
 nlp = spacy.load("en_core_web_sm")           # load package "en_core_web_sm"
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 6278dd4899b..cb2386e1fb8 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1181,12 +1181,13 @@
             "author_links": {
                 "github": "mholtzscher"
             },
-            "category": ["pipeline"]
+            "category": ["pipeline"],
+            "spacy_version": 2
         },
         {
             "id": "spacy_cld",
             "title": "spaCy-CLD",
-            "slogan": "Add language detection to your spaCy pipeline using CLD2",
+            "slogan": "Add language detection to your spaCy v2 pipeline using CLD2",
             "description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score (reported in with each language.",
             "github": "nickdavidhaynes/spacy-cld",
             "pip": "spacy_cld",
@@ -1206,7 +1207,8 @@
             "author_links": {
                 "github": "nickdavidhaynes"
             },
-            "category": ["pipeline"]
+            "category": ["pipeline"],
+            "spacy_version": 2
         },
         {
             "id": "spacy-iwnlp",
@@ -1280,7 +1282,8 @@
                 "github": "sammous"
             },
             "category": ["pipeline"],
-            "tags": ["pos", "lemmatizer", "french"]
+            "tags": ["pos", "lemmatizer", "french"],
+            "spacy_version": 2
         },
         {
             "id": "lemmy",
@@ -1474,8 +1477,8 @@
         },
         {
             "id": "neuralcoref",
-            "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy",
-            "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.",
+            "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy v2",
+            "description": "This coreference resolution module is based on the super fast spaCy parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source, and v3+ is not supported.",
             "github": "huggingface/neuralcoref",
             "thumb": "https://i.imgur.com/j6FO9O6.jpg",
             "code_example": [
@@ -1496,7 +1499,8 @@
                 "github": "huggingface"
             },
             "category": ["standalone", "conversational", "models"],
-            "tags": ["coref"]
+            "tags": ["coref"],
+            "spacy_version": 2
         },
         {
             "id": "neuralcoref-vizualizer",
@@ -1572,7 +1576,7 @@
                 "import spacy",
                 "import explacy",
                 "",
-                "nlp = spacy.load('en')",
+                "nlp = spacy.load('en_core_web_sm')",
                 "explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')"
             ],
             "author": "Tyler Neylon",

From 195dc53b0032e07f94cd53a23b391222a261b1a9 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 8 Dec 2022 13:24:45 +0100
Subject: [PATCH 026/504] Remove unused, experimental multi-task components
 (#11919)

* Remove experimental multi-task components

These are incomplete implementations and are not usable in their current state.

* Remove orphaned error message

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Revert "Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)"

This reverts commit 77c0fd7b176be80e8438fa21440a85d1fe26e39b.

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 setup.py                     |   1 -
 spacy/errors.py              |   2 -
 spacy/pipeline/multitask.pyx | 215 -----------------------------------
 3 files changed, 218 deletions(-)
 delete mode 100644 spacy/pipeline/multitask.pyx

diff --git a/setup.py b/setup.py
index c9b4f7171e3..a80016ea9ea 100755
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,6 @@
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
diff --git a/spacy/errors.py b/spacy/errors.py
index 5f03d0eae94..11b8980fd9d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -249,8 +249,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E016 = ("MultitaskObjective target should be function or one of: dep, "
-            "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
deleted file mode 100644
index f33a90fde85..00000000000
--- a/spacy/pipeline/multitask.pyx
+++ /dev/null
@@ -1,215 +0,0 @@
-# cython: infer_types=True, binding=True
-from typing import Optional
-
-import numpy
-from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
-
-from ..attrs import ID
-from ..errors import Errors
-from ..language import Language
-from ..training import validate_examples
-from .tagger import Tagger
-from .trainable_pipe import TrainablePipe
-
-default_model_config = """
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v2"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-"""
-DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
-
-
-@Language.factory(
-    "nn_labeller",
-    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
-)
-def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
-    return MultitaskObjective(nlp.vocab, model, name)
-
-
-class MultitaskObjective(Tagger):
-    """Experimental: Assist training of a parser or tagger, by training a
-    side-objective.
-    """
-
-    def __init__(self, vocab, model, name="nn_labeller", *, target):
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        if target == "dep":
-            self.make_label = self.make_dep
-        elif target == "tag":
-            self.make_label = self.make_tag
-        elif target == "ent":
-            self.make_label = self.make_ent
-        elif target == "dep_tag_offset":
-            self.make_label = self.make_dep_tag_offset
-        elif target == "ent_tag":
-            self.make_label = self.make_ent_tag
-        elif target == "sent_start":
-            self.make_label = self.make_sent_start
-        elif hasattr(target, "__call__"):
-            self.make_label = target
-        else:
-            raise ValueError(Errors.E016)
-        cfg = {"labels": {}, "target": target}
-        self.cfg = dict(cfg)
-
-    @property
-    def labels(self):
-        return self.cfg.setdefault("labels", {})
-
-    @labels.setter
-    def labels(self, value):
-        self.cfg["labels"] = value
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None, labels=None):
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
-            raise ValueError(err)
-        if labels is not None:
-            self.labels = labels
-        else:
-            for example in get_examples():
-                for token in example.y:
-                    label = self.make_label(token)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
-        self.model.initialize()   # TODO: fix initialization by defining X and Y
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        scores = self.model.get_ref("softmax")(tokvecs)
-        return tokvecs, scores
-
-    def get_loss(self, examples, scores):
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        for i, eg in enumerate(examples):
-            # Handles alignment for tokenization differences
-            _doc_annots = eg.get_aligned()  # TODO
-            for j in range(len(eg.predicted)):
-                tok_annots = {key: values[j] for key, values in tok_annots.items()}
-                label = self.make_label(j, tok_annots)
-                if label is None or label not in self.labels:
-                    correct[idx] = guesses[idx]
-                else:
-                    correct[idx] = self.labels[label]
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        loss = (d_scores**2).sum()
-        return float(loss), d_scores
-
-    @staticmethod
-    def make_dep(token):
-        return token.dep_
-
-    @staticmethod
-    def make_tag(token):
-        return token.tag_
-
-    @staticmethod
-    def make_ent(token):
-        if token.ent_iob_ == "O":
-            return "O"
-        else:
-            return token.ent_iob_ + "-" + token.ent_type_
-
-    @staticmethod
-    def make_dep_tag_offset(token):
-        dep = token.dep_
-        tag = token.tag_
-        offset = token.head.i - token.i
-        offset = min(offset, 2)
-        offset = max(offset, -2)
-        return f"{dep}-{tag}:{offset}"
-
-    @staticmethod
-    def make_ent_tag(token):
-        if token.ent_iob_ == "O":
-            ent = "O"
-        else:
-            ent = token.ent_iob_ + "-" + token.ent_type_
-        tag = token.tag_
-        return f"{tag}-{ent}"
-
-    @staticmethod
-    def make_sent_start(token):
-        """A multi-task objective for representing sentence boundaries,
-        using BILU scheme. (O is impossible)
-        """
-        if token.is_sent_start and token.is_sent_end:
-            return "U-SENT"
-        elif token.is_sent_start:
-            return "B-SENT"
-        else:
-            return "I-SENT"
-
-
-class ClozeMultitask(TrainablePipe):
-    def __init__(self, vocab, model, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self.cfg = cfg
-        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None):
-        self.model.initialize()  # TODO: fix initialization by defining X and Y
-        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.initialize(X)
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        vectors = self.model.get_ref("output_layer")(tokvecs)
-        return tokvecs, vectors
-
-    def get_loss(self, examples, vectors, prediction):
-        validate_examples(examples, "ClozeMultitask.get_loss")
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
-        target = vectors[ids]
-        gradient = self.distance.get_grad(prediction, target)
-        loss = self.distance.get_loss(prediction, target)
-        return float(loss), gradient
-
-    def update(self, examples, *, drop=0., sgd=None, losses=None):
-        pass
-
-    def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
-        set_dropout_rate(self.model, drop)
-        validate_examples(examples, "ClozeMultitask.rehearse")
-        predictions, bp_predictions = self.model.begin_update()
-        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
-        bp_predictions(d_predictions)
-        if sgd is not None:
-            self.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
-        return losses
-
-    def add_label(self, label):
-        raise NotImplementedError

From 87f65a85846b7055e35ce54d08939010d66bff29 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 12 Dec 2022 08:55:53 +0100
Subject: [PATCH 027/504] Custom extensions for spans with equal boundaries
 (#11429)

* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/doc/test_underscore.py | 119 +++++++++++++++++++++++++++++
 spacy/tokens/doc.pyx               |  30 ++++++--
 spacy/tokens/span.pyx              |  38 +++++++--
 spacy/tokens/underscore.py         |  44 ++++++++++-
 4 files changed, 214 insertions(+), 17 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index b79d2f01f41..ca5c2ad3959 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+
 
 @pytest.fixture(scope="function", autouse=True)
 def clean_underscore():
@@ -172,3 +176,118 @@ def test_method(doc, arg1=1, arg2=2):
     doc = Doc(en_vocab, words=["hello", "world"])
     assert test_method.__doc__ == "I am a docstring"
     assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
+
+
+def test_underscore_for_unique_span(en_tokenizer):
+    """Test that spans with the same boundaries but with different labels are uniquely identified (see #9706)."""
+    Doc.set_extension(name="doc_extension", default=None)
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    doc._.doc_extension = "doc extension"
+    doc[0]._.token_extension = "token extension"
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    # Assert extensions
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change label of span and assert extensions
+    span_1.label_ = "NEW_LABEL"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change KB_ID and assert extensions
+    span_1.kb_id_ = "KB_ID"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change extensions and assert
+    span_2._.span_extension = "updated span_2 extension"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Change span ID and assert extensions
+    span_2.id = 2
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Assert extensions with original key
+    assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
+    assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
+
+
+def test_underscore_for_unique_span_from_docs(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text_1 = "Hello, world!"
+    doc_1 = en_tokenizer(text_1)
+    span_1a = Span(doc_1, 0, 2, "SPAN_1a")
+    span_1b = Span(doc_1, 0, 2, "SPAN_1b")
+
+    text_2 = "This is a test."
+    doc_2 = en_tokenizer(text_2)
+    span_2a = Span(doc_2, 0, 3, "SPAN_2a")
+
+    # Set custom extensions
+    doc_1[0]._.token_extension = "token_1"
+    doc_2[1]._.token_extension = "token_2"
+    span_1a._.span_extension = "span_1a extension"
+    span_1b._.span_extension = "span_1b extension"
+    span_2a._.span_extension = "span_2a extension"
+
+    doc = Doc.from_docs([doc_1, doc_2])
+    # Assert extensions
+    assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
+
+    # Check extensions on merged doc
+    assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert (
+        doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_2a.start_char + len(doc_1.text) + 1,
+                span_2a.end_char + len(doc_1.text) + 1,
+                span_2a.label,
+                span_2a.kb_id,
+                span_2a.id,
+            )
+        ]
+        == "span_2a extension"
+    )
+
+
+def test_underscore_for_unique_span_as_span(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
+    Span.set_extension(name="span_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    span_doc = span_1.as_doc(copy_user_data=True)
+
+    # Assert extensions
+    assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 48def8c9544..09dc94297f0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1222,13 +1222,22 @@ cdef class Doc:
 
             if "user_data" not in exclude:
                 for key, value in doc.user_data.items():
-                    if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                        data_type, name, start, end = key
+                    if isinstance(key, tuple) and len(key) >= 4 and key[0] == "._.":
+                        data_type = key[0]
+                        name = key[1]
+                        start = key[2]
+                        end = key[3]
                         if start is not None or end is not None:
                             start += char_offset
                             if end is not None:
                                 end += char_offset
-                            concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+                                _label = key[4]
+                                _kb_id = key[5]
+                                _span_id = key[6]
+                                concat_user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
+                            else:
+                                concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+
                         else:
                             warnings.warn(Warnings.W101.format(name=name))
                     else:
@@ -1672,7 +1681,11 @@ cdef class Doc:
                 Span.set_extension(span_attr)
             for span_data in doc_json["underscore_span"][span_attr]:
                 value = span_data["value"]
-                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
+                span = self.char_span(span_data["start"], span_data["end"])
+                span.label = span_data["label"]
+                span.kb_id = span_data["kb_id"]
+                span.id = span_data["id"]
+                span._.set(span_attr, value)
         return self
 
     def to_json(self, underscore=None):
@@ -1750,13 +1763,16 @@ cdef class Doc:
                                 if attr not in data["underscore_token"]:
                                     data["underscore_token"][attr] = []
                                 data["underscore_token"][attr].append({"start": start, "value": value})
-                            # Span attribute
-                            elif start is not None and end is not None:
+                            # Else span attribute
+                            elif end is not None:
+                                _label = data_key[4]
+                                _kb_id = data_key[5]
+                                _span_id = data_key[6]
                                 if "underscore_span" not in data:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 73f555747eb..bf37f955d98 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -230,11 +230,10 @@ cdef class Span:
         cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char)
+                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
-
         copy_user_data (bool): Whether or not to copy the original doc's user data.
         array_head (tuple): `Doc` array attrs, can be passed in to speed up computation.
         array (ndarray): `Doc` as array, can be passed in to speed up computation.
@@ -287,12 +286,22 @@ cdef class Span:
             char_offset = self.start_char
             for key, value in self.doc.user_data.items():
                 if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                    data_type, name, start, end = key
+                    data_type = key[0]
+                    name = key[1]
+                    start = key[2]
+                    end = key[3]
                     if start is not None or end is not None:
                         start -= char_offset
+                        # Check if Span object
                         if end is not None:
                             end -= char_offset
-                        user_data[(data_type, name, start, end)] = copy.copy(value)
+                            _label = key[4]
+                            _kb_id = key[5]
+                            _span_id = key[6]
+                            user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
+                        # Else Token object
+                        else:
+                            user_data[(data_type, name, start, end)] = copy.copy(value)
                 else:
                     user_data[key] = copy.copy(value)
             doc.user_data = user_data
@@ -815,21 +824,36 @@ cdef class Span:
             return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.span_c().label = label
+            if label != self.span_c().label :
+                old_label = self.span_c().label
+                self.span_c().label = label
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.span_c().kb_id = kb_id
+            if kb_id != self.span_c().kb_id :
+                old_kb_id = self.span_c().kb_id
+                self.span_c().kb_id = kb_id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property id:
         def __get__(self):
             return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.span_c().id = id
+            if id != self.span_c().id :
+                old_id = self.span_c().id
+                self.span_c().id = id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
+                Underscore._replace_keys(old, new)
 
     property ent_id:
         """Alias for the span's ID."""
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 0aa0c1e6d40..63706851286 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
+from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
-    from .span import Span
     from .token import Token
 
 
@@ -26,6 +26,9 @@ def __init__(
         obj: Union["Doc", "Span", "Token"],
         start: Optional[int] = None,
         end: Optional[int] = None,
+        label: int = 0,
+        kb_id: int = 0,
+        span_id: int = 0,
     ):
         object.__setattr__(self, "_extensions", extensions)
         object.__setattr__(self, "_obj", obj)
@@ -37,6 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
+        if type(obj) == Span:
+            object.__setattr__(self, "_label", label)
+            object.__setattr__(self, "_kb_id", kb_id)
+            object.__setattr__(self, "_span_id", span_id)
 
     def __dir__(self) -> List[str]:
         # Hack to enable autocomplete on custom extensions
@@ -89,8 +96,39 @@ def get(self, name: str) -> Any:
     def has(self, name: str) -> bool:
         return name in self._extensions
 
-    def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
-        return ("._.", name, self._start, self._end)
+    def _get_key(
+        self, name: str
+    ) -> Union[
+        Tuple[str, str, Optional[int], Optional[int]],
+        Tuple[str, str, Optional[int], Optional[int], int, int, int],
+    ]:
+        if hasattr(self, "_label"):
+            return (
+                "._.",
+                name,
+                self._start,
+                self._end,
+                self._label,
+                self._kb_id,
+                self._span_id,
+            )
+        else:
+            return "._.", name, self._start, self._end
+
+    @staticmethod
+    def _replace_keys(old_underscore: "Underscore", new_underscore: "Underscore"):
+        """
+        This function is called by Span when its kb_id or label are re-assigned.
+        It checks if any user_data is stored for this span and replaces the keys
+        """
+        for name in old_underscore._extensions:
+            old_key = old_underscore._get_key(name)
+            old_doc = old_underscore._doc
+            new_key = new_underscore._get_key(name)
+            if old_key != new_key and old_key in old_doc.user_data:
+                old_underscore._doc.user_data[
+                    new_key
+                ] = old_underscore._doc.user_data.pop(old_key)
 
     @classmethod
     def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:

From db9886ed695a51a08a4130c491865b69673abe2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Sat, 17 Dec 2022 14:32:19 +0100
Subject: [PATCH 028/504] Fix v4 branch to build against Thinc v9 (#11921)

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 pyproject.toml                                |   5 +-
 requirements.txt                              |   2 +-
 setup.cfg                                     |   4 +-
 setup.py                                      |   2 +
 spacy/ml/parser_model.pyx                     |  26 +-
 .../_parser_internals/_beam_utils.pxd         |   3 +-
 .../_parser_internals/_beam_utils.pyx         |  12 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +-
 spacy/pipeline/_parser_internals/ner.pyx      |   4 +-
 spacy/pipeline/_parser_internals/search.pxd   |  89 +++++
 spacy/pipeline/_parser_internals/search.pyx   | 306 ++++++++++++++++++
 spacy/pipeline/edit_tree_lemmatizer.py        |   7 +-
 spacy/pipeline/morphologizer.pyx              |   5 +-
 spacy/pipeline/senter.pyx                     |   6 +-
 spacy/pipeline/tagger.pyx                     |   7 +-
 spacy/pipeline/transition_parser.pyx          |  21 +-
 spacy/tests/conftest.py                       |  32 ++
 spacy/tests/parser/_search.pyx                | 119 +++++++
 spacy/tests/parser/test_search.py             |   3 +
 19 files changed, 606 insertions(+), 50 deletions(-)
 create mode 100644 spacy/pipeline/_parser_internals/search.pxd
 create mode 100644 spacy/pipeline/_parser_internals/search.pyx
 create mode 100644 spacy/tests/parser/_search.pyx
 create mode 100644 spacy/tests/parser/test_search.py

diff --git a/pyproject.toml b/pyproject.toml
index bfd7e68d1f7..77e1471426f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,8 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
-    "numpy>=1.15.0; python_version < '3.9'",
-    "numpy>=1.25.0; python_version >= '3.9'",
+    "thinc>=9.0.0.dev0,<9.1.0",
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 036867ddc4b..6167d37f900 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 1dbf8f56454..05cc991eb68 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
-    wasabi>=0.9.1,<1.2.0
+    thinc>=9.0.0.dev0,<9.1.0
+    wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.4.0
diff --git a/setup.py b/setup.py
index a80016ea9ea..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@
     "spacy.pipeline._parser_internals.arc_eager",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals.search",
     "spacy.pipeline._parser_internals._state",
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
@@ -66,6 +67,7 @@
     "spacy.matcher.dependencymatcher",
     "spacy.symbols",
     "spacy.vectors",
+    "spacy.tests.parser._search",
 ]
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index cb323e98891..10a9f0bc485 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -3,7 +3,6 @@
 cimport numpy as np
 from libc.math cimport exp
 from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 from thinc.backends.linalg cimport Vec, VecVec
 
@@ -116,14 +115,10 @@ cdef void predict_states(
         n.hiddens * n.pieces
     )
     for i in range(n.states):
-        VecVec.add_i(
-            &A.unmaxed[i*n.hiddens*n.pieces],
-            W.feat_bias, 1.,
-            n.hiddens * n.pieces
-        )
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
-            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
+            which = _arg_max(&A.unmaxed[index], n.pieces)
             A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
     memset(A.scores, 0, n.states * n.classes * sizeof(float))
     if W.hidden_weights == NULL:
@@ -138,7 +133,7 @@ cdef void predict_states(
         )
         # Add bias
         for i in range(n.states):
-            VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
     # Set unseen classes to minimum value
     i = 0
     min_ = A.scores[0]
@@ -187,7 +182,8 @@ cdef void cpu_log_loss(
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = Vec.arg_max(scores, O)
+    guess = _arg_max(scores, O)
+
     if best == -1 or guess == -1:
         # These shouldn't happen, but if they do, we want to make sure we don't
         # cause an OOB access.
@@ -529,3 +525,15 @@ cdef class precompute_hiddens:
             return d_best.reshape((d_best.shape + (1,)))
 
         return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 596306b2319..571f246b1e3 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,7 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
-
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
 cdef int check_final_state(void* _state, void* extra_args) except -1
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index ac04be5a719..d004d313c3e 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,21 +1,17 @@
 # cython: infer_types=True
 import numpy
-
-from thinc.extra.search cimport Beam
-
-from thinc.extra.search import MaxViolation
-
-from thinc.extra.search cimport MaxViolation
+from cpython.ref cimport PyObject, Py_XDECREF
 
 from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
-
+from .search cimport Beam, MaxViolation
+from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
 
 
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
     dest = <StateC*>_dest
     src = <StateC*>_src
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index e1375494482..10f2649baa0 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -18,8 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
-
-from thinc.extra.search cimport Beam
+from .search cimport Beam
 
 
 cdef weight_t MIN_SCORE = -90000
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index c77b7b50f2d..6851f9f2096 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -8,8 +8,6 @@ from libc.stdint cimport int32_t
 
 from collections import Counter
 
-from thinc.extra.search cimport Beam
-
 from ...tokens.doc cimport Doc
 
 from ...tokens.span import Span
@@ -23,6 +21,8 @@ from ...typedefs cimport attr_t, weight_t
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
+from .search cimport Beam
+from .stateclass cimport StateClass
 from ._state cimport StateC
 from .stateclass cimport StateClass
 from .transition_system cimport Transition, do_func_t
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
new file mode 100644
index 00000000000..dfe30e1c130
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -0,0 +1,89 @@
+from cymem.cymem cimport Pool
+
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
+from libcpp.pair cimport pair
+from libcpp.queue cimport priority_queue
+from libcpp.vector cimport vector
+
+from ...typedefs cimport class_t, weight_t, hash_t
+
+ctypedef pair[weight_t, size_t] Entry
+ctypedef priority_queue[Entry] Queue
+
+
+ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
+
+ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
+
+ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
+
+ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
+
+ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
+
+
+cdef struct _State:
+    void* content
+    class_t* hist
+    weight_t score
+    weight_t loss
+    int i
+    int t
+    bint is_done
+
+
+cdef class Beam:
+    cdef Pool mem
+    cdef class_t nr_class
+    cdef class_t width
+    cdef class_t size
+    cdef public weight_t min_density
+    cdef int t
+    cdef readonly bint is_done
+    cdef list histories
+    cdef list _parent_histories
+    cdef weight_t** scores
+    cdef int** is_valid
+    cdef weight_t** costs
+    cdef _State* _parents
+    cdef _State* _states
+    cdef del_func_t del_func
+
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
+
+    cdef inline void* at(self, int i) nogil:
+        return self._states[i].content
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
+ 
+
+    cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
+        self.scores[i][j] = score
+        self.is_valid[i][j] = is_valid
+        self.costs[i][j] = cost
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
+
+
+cdef class MaxViolation:
+    cdef Pool mem
+    cdef weight_t cost
+    cdef weight_t delta
+    cdef readonly weight_t p_score
+    cdef readonly weight_t g_score
+    cdef readonly double Z
+    cdef readonly double gZ
+    cdef class_t n
+    cdef readonly list p_hist
+    cdef readonly list g_hist
+    cdef readonly list p_probs
+    cdef readonly list g_probs
+
+    cpdef int check(self, Beam pred, Beam gold) except -1
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
new file mode 100644
index 00000000000..1d9b6dd7adf
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -0,0 +1,306 @@
+# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+cimport cython
+from libc.string cimport memset, memcpy
+from libc.math cimport log, exp
+import math
+
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
+
+cdef class Beam:
+    def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
+        assert nr_class != 0
+        assert width != 0
+        self.nr_class = nr_class
+        self.width = width
+        self.min_density = min_density
+        self.size = 1
+        self.t = 0
+        self.mem = Pool()
+        self.del_func = NULL
+        self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        cdef int i
+        self.histories = [[] for i in range(self.width)]
+        self._parent_histories = [[] for i in range(self.width)]
+
+        self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        for i in range(self.width):
+            self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+            self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
+            self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+
+    def __len__(self):
+        return self.size
+
+    property score:
+        def __get__(self):
+            return self._states[0].score
+
+    property min_score:
+        def __get__(self):
+            return self._states[self.size-1].score
+
+    property loss:
+        def __get__(self):
+            return self._states[0].loss
+
+    property probs:
+        def __get__(self):
+            return _softmax([self._states[i].score for i in range(self.size)])
+
+    property scores:
+        def __get__(self):
+            return [self._states[i].score for i in range(self.size)]
+
+    property histories:
+        def __get__(self):
+            return self.histories
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1:
+        cdef int j
+        for j in range(self.nr_class):
+            self.scores[i][j] = scores[j]
+            self.is_valid[i][j] = is_valid[j]
+            self.costs[i][j] = costs[j]
+
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
+        cdef int i, j
+        for i in range(self.width):
+            memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
+            memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
+            memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
+        for i in range(self.width):
+            self._states[i].content = init_func(self.mem, n, extra_args)
+            self._parents[i].content = init_func(self.mem, n, extra_args)
+        self.del_func = del_func
+
+    def __dealloc__(self):
+        if self.del_func == NULL:
+            return
+
+        for i in range(self.width):
+            self.del_func(self.mem, self._states[i].content, NULL)
+            self.del_func(self.mem, self._parents[i].content, NULL)
+
+    @cython.cdivision(True)
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1:
+        cdef weight_t** scores = self.scores
+        cdef int** is_valid = self.is_valid
+        cdef weight_t** costs = self.costs
+
+        cdef Queue* q = new Queue()
+        self._fill(q, scores, is_valid)
+        # For a beam of width k, we only ever need 2k state objects. How?
+        # Each transition takes a parent and a class and produces a new state.
+        # So, we don't need the whole history --- just the parent. So at
+        # each step, we take a parent, and apply one or more extensions to
+        # it.
+        self._parents, self._states = self._states, self._parents
+        self._parent_histories, self.histories = self.histories, self._parent_histories
+        cdef weight_t score
+        cdef int p_i
+        cdef int i = 0
+        cdef class_t clas
+        cdef _State* parent
+        cdef _State* state
+        cdef hash_t key
+        cdef PreshMap seen_states = PreshMap(self.width)
+        cdef uint64_t is_seen
+        cdef uint64_t one = 1
+        while i < self.width and not q.empty():
+            data = q.top()
+            p_i = data.second / self.nr_class
+            clas = data.second % self.nr_class
+            score = data.first
+            q.pop()
+            parent = &self._parents[p_i]
+            # Indicates terminal state reached; i.e. state is done
+            if parent.is_done:
+                # Now parent will not be changed, so we don't have to copy.
+                # Once finished, should also be unbranching.
+                self._states[i], parent[0] = parent[0], self._states[i]
+                parent.i = self._states[i].i
+                parent.t = self._states[i].t
+                parent.is_done = self._states[i].t
+                self._states[i].score = score
+                self.histories[i] = list(self._parent_histories[p_i])
+                i += 1
+            else:
+                state = &self._states[i]
+                # The supplied transition function should adjust the destination
+                # state to be the result of applying the class to the source state
+                transition_func(state.content, parent.content, clas, extra_args)
+                key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
+                is_seen = <uint64_t>seen_states.get(key)
+                if key == 0 or key == 1 or not is_seen:
+                    if key != 0 and key != 1:
+                        seen_states.set(key, <void*>one)
+                    state.score = score
+                    state.loss = parent.loss + costs[p_i][clas]
+                    self.histories[i] = list(self._parent_histories[p_i])
+                    self.histories[i].append(clas)
+                    i += 1
+        del q
+        self.size = i
+        assert self.size >= 1
+        for i in range(self.width):
+            memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
+        self.t += 1
+
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
+        cdef int i
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self._states[i].is_done = finish_func(self._states[i].content, extra_args)
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self.is_done = False
+                break
+        else:
+            self.is_done = True
+
+    @cython.cdivision(True)
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
+        """Populate the queue from a k * n matrix of scores, where k is the
+        beam-width, and n is the number of classes.
+        """
+        cdef Entry entry
+        cdef weight_t score
+        cdef _State* s
+        cdef int i, j, move_id
+        assert self.size >= 1
+        cdef vector[Entry] entries
+        for i in range(self.size):
+            s = &self._states[i]
+            move_id = i * self.nr_class
+            if s.is_done:
+                # Update score by path average, following TACL '13 paper.
+                if self.histories[i]:
+                    entry.first = s.score + (s.score / self.t)
+                else:
+                    entry.first = s.score
+                entry.second = move_id
+                entries.push_back(entry)
+            else:
+                for j in range(self.nr_class):
+                    if is_valid[i][j]:
+                        entry.first = s.score + scores[i][j]
+                        entry.second = move_id + j
+                        entries.push_back(entry)
+        cdef double max_, Z, cutoff
+        if self.min_density == 0.0:
+            for i in range(entries.size()):
+                q.push(entries[i])
+        elif not entries.empty():
+            max_ = entries[0].first
+            Z = 0.
+            cutoff = 0.
+            # Softmax into probabilities, so we can prune
+            for i in range(entries.size()):
+                if entries[i].first > max_:
+                    max_ = entries[i].first
+            for i in range(entries.size()):
+                Z += exp(entries[i].first-max_)
+            cutoff = (1. / Z) * self.min_density
+            for i in range(entries.size()):
+                prob = exp(entries[i].first-max_) / Z
+                if prob >= cutoff:
+                    q.push(entries[i])
+
+
+cdef class MaxViolation:
+    def __init__(self):
+        self.p_score = 0.0
+        self.g_score = 0.0
+        self.Z = 0.0
+        self.gZ = 0.0
+        self.delta = -1
+        self.cost = 0
+        self.p_hist = []
+        self.g_hist = []
+        self.p_probs = []
+        self.g_probs = []
+
+    cpdef int check(self, Beam pred, Beam gold) except -1:
+        cdef _State* p = &pred._states[0]
+        cdef _State* g = &gold._states[0]
+        cdef weight_t d = p.score - g.score
+        if p.loss >= 1 and (self.cost == 0 or d > self.delta):
+            self.cost = p.loss
+            self.delta = d
+            self.p_hist = list(pred.histories[0])
+            self.g_hist = list(gold.histories[0])
+            self.p_score = p.score
+            self.g_score = g.score
+            self.Z = 1e-10
+            self.gZ = 1e-10
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    self.Z += exp(pred._states[i].score)
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    prob = exp(gold._states[i].score)
+                    self.Z += prob
+                    self.gZ += prob
+
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1:
+        d = pred.score - gold.score
+        seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
+        if pred.loss > 0 and (self.cost == 0 or d > self.delta):
+            p_hist = []
+            p_scores = []
+            g_hist = []
+            g_scores = []
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    p_scores.append(pred._states[i].score)
+                    p_hist.append(list(pred.histories[i]))
+                # This can happen from non-monotonic actions
+                # If we find a better gold analysis this way, be sure to keep it.
+                elif pred._states[i].loss <= 0 \
+                and tuple(pred.histories[i]) not in seen_golds:
+                    g_scores.append(pred._states[i].score)
+                    g_hist.append(list(pred.histories[i]))
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    g_scores.append(gold._states[i].score)
+                    g_hist.append(list(gold.histories[i]))
+
+            all_probs = _softmax(p_scores + g_scores)
+            p_probs = all_probs[:len(p_scores)]
+            g_probs_all = all_probs[len(p_scores):]
+            g_probs = _softmax(g_scores)
+
+            self.cost = pred.loss
+            self.delta = d
+            self.p_hist = p_hist
+            self.g_hist = g_hist
+            # TODO: These variables are misnamed! These are the gradients of the loss.
+            self.p_probs = p_probs
+            # Intuition here:
+            # The gradient of the loss is:
+            # P(model) - P(truth)
+            # Normally, P(truth) is 1 for the gold
+            # But, if we want to do the "partial credit" scheme, we want
+            # to create a distribution over the gold, proportional to the scores
+            # awarded.
+            self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
+
+
+def _softmax(nums):
+    if not nums:
+        return []
+    max_ = max(nums)
+    nums = [(exp(n-max_) if n is not None else None) for n in nums]
+    Z = sum(n for n in nums if n is not None)
+    return [(n/Z if n is not None else None) for n in nums]
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 2ef639cad52..f9a8ae10561 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,9 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model
 from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 
 from .. import util
 from ..errors import Errors
@@ -131,7 +132,9 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
+        loss_func = LegacySequenceCategoricalCrossentropy(
+            normalize=False, missing_value=-1
+        )
 
         truths = []
         for eg in examples:
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index cc8f87936b9..d3068bdffdd 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import srsly
-from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
@@ -302,7 +303,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 521afe1d181..185430c122c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -4,7 +4,9 @@ from itertools import islice
 from typing import Callable, Optional
 
 import srsly
-from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
@@ -163,7 +165,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 8ecd0c46ee0..f25ee00407b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -2,7 +2,8 @@
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import numpy
 import srsly
-from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, set_dropout_rate, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
@@ -242,7 +243,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = SequenceCategoricalCrossentropy()
+        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -273,7 +274,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
+        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index b8ebbf8ca88..d310df92151 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,23 +13,20 @@ from libcpp.vector cimport vector
 
 import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+import numpy.random
 import numpy
 import numpy.random
 import srsly
 from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ..ml.parser_model cimport (
-    ActivationsC,
-    SizesC,
-    WeightsC,
-    alloc_activations,
-    arg_max_if_valid,
-    cpu_log_loss,
-    free_activations,
-    get_c_sizes,
-    get_c_weights,
-    predict_states,
-)
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
 from ._parser_internals.stateclass cimport StateClass
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 2a9f441c9b0..6085b89cf02 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,5 +1,10 @@
 import pytest
+from spacy.util import get_lang_class
+import functools
 from hypothesis import settings
+import inspect
+import importlib
+import sys
 
 from spacy.util import get_lang_class
 
@@ -48,6 +53,33 @@ def getopt(opt):
             pytest.skip("not referencing any issues")
 
 
+# Decorator for Cython-built tests
+# https://shwina.github.io/cython-testing/
+def cytest(func):
+    """
+    Wraps `func` in a plain Python function.
+    """
+
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        bound = inspect.signature(func).bind(*args, **kwargs)
+        return func(*bound.args, **bound.kwargs)
+
+    return wrapped
+
+
+def register_cython_tests(cython_mod_name: str, test_mod_name: str):
+    """
+    Registers all callables with name `test_*` in Cython module `cython_mod_name`
+    as attributes in module `test_mod_name`, making them discoverable by pytest.
+    """
+    cython_mod = importlib.import_module(cython_mod_name)
+    for name in dir(cython_mod):
+        item = getattr(cython_mod, name)
+        if callable(item) and name.startswith("test_"):
+            setattr(sys.modules[test_mod_name], name, item)
+
+
 # Fixtures for language tokenizers (languages sorted alphabetically)
 
 
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
new file mode 100644
index 00000000000..23fc8164412
--- /dev/null
+++ b/spacy/tests/parser/_search.pyx
@@ -0,0 +1,119 @@
+# cython: infer_types=True, binding=True
+from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
+from spacy.typedefs cimport class_t, weight_t
+from cymem.cymem cimport Pool
+
+from ..conftest import cytest
+import pytest
+
+cdef struct TestState:
+    int length
+    int x
+    Py_UNICODE* string
+
+
+cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
+    dest_state = <TestState*>dest
+    src_state = <TestState*>src
+    dest_state.length = src_state.length
+    dest_state.x = src_state.x
+    dest_state.x += clas
+    if extra_args != NULL:
+        dest_state.string = <Py_UNICODE*>extra_args
+    else:
+        dest_state.string = src_state.string
+
+
+cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
+    state = <TestState*>mem.alloc(1, sizeof(TestState))
+    state.length = n
+    state.x = 1
+    if extra_args == NULL:
+        state.string = u'default'
+    else:
+        state.string = <Py_UNICODE*>extra_args
+    return state
+
+
+cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
+    state = <TestState*>state
+    mem.free(state)
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width",
+    [
+        (2, 3),
+        (3, 6),
+        (4, 20),
+    ]
+)
+def test_init(nr_class, beam_width):
+    b = Beam(nr_class, beam_width)
+    assert b.size == 1
+    assert b.width == beam_width
+    assert b.nr_class == nr_class
+
+@cytest
+def test_init_violn():
+    MaxViolation()
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (2, 3, 3),
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_initialize(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length, s.length
+        assert s.string == 'default'
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length,extra",
+    [
+        (2, 3, 4, None),
+        (3, 6, 15, u"test beam 1"),
+    ]
+)
+def test_initialize_extra(nr_class, beam_width, length, extra):
+    b = Beam(nr_class, beam_width)
+    if extra is None:
+        b.initialize(initialize, destroy, length, NULL)
+    else:
+        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_transition(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    b.set_cell(0, 2, 30, True, 0)
+    b.set_cell(0, 1, 42, False, 0)
+    b.advance(transition, NULL, NULL)
+    assert b.size == 1, b.size
+    assert b.score == 30, b.score
+    s = <TestState*>b.at(0)
+    assert s.x == 3
+    assert b._states[0].score == 30, b._states[0].score
+    b.set_cell(0, 1, 10, True, 0)
+    b.set_cell(0, 2, 20, True, 0)
+    b.advance(transition, NULL, NULL)
+    assert b._states[0].score == 50, b._states[0].score
+    assert b._states[1].score == 40
+    s = <TestState*>b.at(0)
+    assert s.x == 5
diff --git a/spacy/tests/parser/test_search.py b/spacy/tests/parser/test_search.py
new file mode 100644
index 00000000000..136c3a11b8a
--- /dev/null
+++ b/spacy/tests/parser/test_search.py
@@ -0,0 +1,3 @@
+from ..conftest import register_cython_tests
+
+register_cython_tests("spacy.tests.parser._search", __name__)

From e81b2df4116219c0c8b352da26781d88ed3639fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Dec 2022 10:23:31 +0100
Subject: [PATCH 029/504] Fix fallout from a previous merge

---
 spacy/pipeline/textcat_multilabel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index ac024ba3639..9ed9770086c 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -163,6 +163,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.
         save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init

From 6ae4278cbe0f79e5b36bd256667bc6e3a3c364d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 29 Dec 2022 08:03:24 +0100
Subject: [PATCH 030/504] Adjust to new `Schedule` class and pass scores to
 `Optimizer` (#12008)

* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
---
 pyproject.toml             |  2 +-
 requirements.txt           |  2 +-
 setup.cfg                  |  4 ++--
 spacy/training/batchers.py | 38 ++++++++++++++++----------------------
 spacy/training/loop.py     |  3 ++-
 spacy/util.py              | 13 +++++++++----
 6 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 77e1471426f..58e4382e829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev0,<9.1.0",
+    "thinc>=9.0.0.dev1,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 6167d37f900..8224ab783f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev0,<9.1.0
+thinc>=9.0.0.dev1,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 05cc991eb68..6c3edd83e73 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev0,<9.1.0
-    wasabi>=0.9.1,<1.1.0
+    thinc>=9.0.0.dev1,<9.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.4.0
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 050c3351b08..519e61315da 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,20 +1,9 @@
 import itertools
-from functools import partial
-from typing import (
-    Any,
-    Callable,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Sequence,
-    TypeVar,
-    Union,
-)
+from thinc.schedules import Schedule, constant as constant_schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int]
+Sizing = Union[Sequence[int], int, Schedule[int]]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -119,12 +108,13 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)  # type: Iterator[int]
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    for outer_batch in minibatch(seqs, size=buffer):
+        assert isinstance(size, Schedule)
+        size_ = size
+    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
         outer_batch = list(outer_batch)
-        target_size = next(size_)
+        target_size = size_(step)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -155,10 +145,12 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)  # type: Iterator[int]
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    target_size = next(size_)
+        assert isinstance(size, Schedule)
+        size_ = size
+    step = 0
+    target_size = size_(step)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -183,7 +175,8 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            target_size = next(size_)
+            step += 1
+            target_size = size_(step)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -201,7 +194,8 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                target_size = next(size_)
+                step += 1
+                target_size = size_(step)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 56df5395720..05c59fc9877 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -215,7 +215,7 @@ def train_while_improving(
         if before_update:
             before_update_args = {"step": step, "epoch": epoch}
             before_update(nlp, before_update_args)
-        dropout = next(dropouts)  # type: ignore
+        dropout = dropouts(optimizer.step)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
                 subbatch,
@@ -241,6 +241,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
+            optimizer.last_score = score
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
diff --git a/spacy/util.py b/spacy/util.py
index 463ac219bf5..551f78cc969 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,7 +1,12 @@
 import functools
 import importlib
 import importlib.util
-import inspect
+import re
+from pathlib import Path
+import thinc
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+import functools
 import itertools
 import logging
 import os
@@ -1637,12 +1642,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
         size_ = size
     items = iter(items)
-    while True:
-        batch_size = next(size_)
+    for step in itertools.count():
+        batch_size = size_(step)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From 943006f501fccb95118c2a865b78ab7ad94d22cb Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso <tetsuok@users.noreply.github.com>
Date: Wed, 4 Jan 2023 01:43:09 +0900
Subject: [PATCH 031/504] Delete unused imports for StringStore (#12040)

---
 spacy/lexeme.pxd    | 18 ++++--------------
 spacy/tokenizer.pxd |  4 ++++
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index ff2e4f92edf..2d14edcd6b0 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,20 +1,10 @@
 from numpy cimport ndarray
 
-from .attrs cimport (
-    ID,
-    LANG,
-    LENGTH,
-    LOWER,
-    NORM,
-    ORTH,
-    PREFIX,
-    SHAPE,
-    SUFFIX,
-    attr_id_t,
-)
-from .strings cimport StringStore
+from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
+from .attrs cimport attr_id_t
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
+
 from .structs cimport LexemeC
-from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index f64e0e93413..c963dcbcfa4 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,6 +2,10 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
+from .typedefs cimport hash_t
+from .structs cimport LexemeC, SpanC, TokenC
+from .tokens.doc cimport Doc
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .strings cimport StringStore
 from .structs cimport LexemeC, SpanC, TokenC

From 16c9303f30a3fc3daa393f4e5234a1f0b778e8bc Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Mon, 9 Jan 2023 20:15:02 +0100
Subject: [PATCH 032/504] Pass `step=0` to `Schedule` class to yield initial
 learning rate (#12078)

---
 spacy/training/loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 05c59fc9877..58d5b06786f 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -111,7 +111,7 @@ def save_checkpoint(is_best):
         stdout.write(
             msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
         )
-    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
     with nlp.select_pipes(disable=frozen_components):
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:

From 7c8fa0e58f474842d1e19bb3fb4e32301e36fcc3 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 18:57:50 +0100
Subject: [PATCH 033/504] update tests from master to follow v4 principles

---
 spacy/tests/pipeline/test_entity_ruler.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 9f5204006ec..ae57da5134c 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,9 +357,9 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy_pipe(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")
@@ -367,9 +367,9 @@ def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")
@@ -377,15 +377,14 @@ def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():
         return lambda x, y, z: False
 
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
-        name="entity_ruler",
+        "entity_ruler",
         config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
     )
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]

From 38c54ea6410f55a9ff1f402f193da5f27347e22e Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 19:04:06 +0100
Subject: [PATCH 034/504] update tests from master to follow v4 principles (2)

---
 spacy/tests/pipeline/test_entity_ruler.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index ae57da5134c..6bff3288dc3 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,7 +357,6 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_pipe(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -367,7 +366,6 @@ def test_entity_ruler_fuzzy_pipe(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -377,7 +375,6 @@ def test_entity_ruler_fuzzy(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():

From dbeaea98224f25355405ad606f3cc181c6ecaeec Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 13 Jan 2023 11:14:58 +0100
Subject: [PATCH 035/504] fix anchors (#12095)

---
 website/docs/api/stringstore.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index d4d85e6d56a..269ac2d0c4b 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -97,7 +97,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | ------------------------------ |
 | **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.items {#iter tag="method" new="4"}
+## StringStore.items {id="items", tag="method", version="4"}
 
 Iterate over the stored string-hash pairs in insertion order.
 
@@ -113,7 +113,7 @@ Iterate over the stored string-hash pairs in insertion order.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
 
-## StringStore.keys {#iter tag="method" new="4"}
+## StringStore.keys {id="keys", tag="method", version="4"}
 
 Iterate over the stored strings in insertion order.
 
@@ -129,7 +129,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | -------------------------------- |
 | **RETURNS** | A list of strings. ~~List[str]~~ |
 
-## StringStore.values {#iter tag="method" new="4"}
+## StringStore.values {id="values", tag="method", version="4"}
 
 Iterate over the stored string hashes in insertion order.
 

From c7693c468ba02a9c56cd9f419c8fd7a7a6e38cde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 16 Jan 2023 10:25:53 +0100
Subject: [PATCH 036/504] Add
 `TrainablePipe.{distill,get_teacher_student_loss}` (#12016)

* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                               |   3 +
 spacy/ml/callbacks.py                         |   1 +
 spacy/pipeline/edit_tree_lemmatizer.py        |  19 +++
 spacy/pipeline/pipe.pyx                       |   4 +
 spacy/pipeline/tagger.pyx                     |  26 ++-
 spacy/pipeline/trainable_pipe.pyx             |  72 +++++++-
 spacy/pipeline/transition_parser.pyx          | 160 +++++++++++++++++-
 spacy/tests/parser/test_ner.py                |  46 +++++
 spacy/tests/parser/test_parse.py              |  49 ++++++
 .../pipeline/test_edit_tree_lemmatizer.py     |  47 +++++
 spacy/tests/pipeline/test_morphologizer.py    |   6 +
 spacy/tests/pipeline/test_senter.py           |   6 +
 spacy/tests/pipeline/test_tagger.py           |  46 +++++
 spacy/tests/pipeline/test_textcat.py          |   6 +
 spacy/tests/training/test_training.py         |  27 +--
 spacy/training/__init__.py                    |   3 +
 spacy/training/example.pyx                    |   7 +
 website/docs/api/dependencyparser.mdx         |  54 ++++++
 website/docs/api/edittreelemmatizer.mdx       |  54 ++++++
 website/docs/api/entityrecognizer.mdx         |  54 ++++++
 website/docs/api/morphologizer.mdx            |  54 ++++++
 website/docs/api/pipe.mdx                     |  61 +++++++
 website/docs/api/sentencerecognizer.mdx       |  54 ++++++
 website/docs/api/tagger.mdx                   |  54 ++++++
 website/docs/api/top-level.mdx                |   3 +-
 website/docs/usage/processing-pipelines.mdx   |  14 +-
 26 files changed, 906 insertions(+), 24 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 11b8980fd9d..9bdb66006e5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -975,6 +975,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4000 = ("Expected a Doc as input, but got: '{type}'")
     E4001 = ("Expected input to be one of the following types: ({expected_types}), "
              "but got '{received_type}'")
+    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
+    E4003 = ("Training examples for distillation must have the exact same tokens in the "
+             "reference and predicted docs.")
 
 
 # fmt: on
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
index e2378a7baf3..0783a5568a9 100644
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -23,6 +23,7 @@
     "update",
     "rehearse",
     "get_loss",
+    "get_teacher_student_loss",
     "initialize",
     "begin_update",
     "finish_update",
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index f9a8ae10561..d5169178b8c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -157,6 +157,25 @@ def get_loss(
 
         return float(loss), d_scores
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index ea5fc5253d9..af7cd09f171 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -89,6 +89,10 @@ cdef class Pipe:
             return self.scorer(examples, **scorer_kwargs)
         return {}
 
+    @property
+    def is_distillable(self) -> bool:
+        return False
+
     @property
     def is_trainable(self) -> bool:
         return False
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f25ee00407b..a8a89332bd4 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Tuple
 import numpy
 import srsly
 from thinc.api import Model, set_dropout_rate, Config
@@ -243,7 +244,6 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -257,12 +257,32 @@ class Tagger(TrainablePipe):
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update(docs)
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
-        grads, loss = loss_func(tag_scores, tutor_tag_scores)
+        loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
         bp_tag_scores(grads)
-        self.finish_update(sgd)
+        if sgd is not None:
+            self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores.
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index bd360c9501b..3ec3e7551aa 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -7,7 +7,7 @@ import warnings
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples
+from ..training import validate_examples, validate_distillation_examples
 from ..errors import Errors, Warnings
 from .pipe import Pipe, deserialize_config
 from .. import util
@@ -59,7 +59,54 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
+
+    def distill(self,
+               teacher_pipe: Optional["TrainablePipe"],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is typically trained on the probability
+        distribution of the teacher, but details may differ per pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+        
+        DOCS: https://spacy.io/api/pipe#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        validate_distillation_examples(examples, "TrainablePipe.distill")
+        set_dropout_rate(self.model, drop)
+        for node in teacher_pipe.model.walk():
+            if node.name == "softmax":
+                node.attrs["softmax_normalize"] = True
+        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        bp_student_scores(d_scores)
+        if sgd is not None:
+            self.finish_update(sgd)
+        losses[self.name] += loss
+        return losses
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -172,6 +219,19 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
 
+    def get_teacher_student_loss(self, teacher_scores, student_scores):
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
+
     def create_optimizer(self) -> Optimizer:
         """Create an optimizer for the pipeline component.
 
@@ -208,6 +268,14 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
+    @property
+    def is_distillable(self) -> bool:
+        # Normally a pipe overrides `get_teacher_student_loss` to implement
+        # distillation. In more exceptional cases, a pipe can provide its
+        # own `distill` implementation. If neither of these methods is
+        # overridden, the pipe does not implement distillation.
+        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
+
     @property
     def is_trainable(self) -> bool:
         return True
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d310df92151..feab7e7404b 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
+from typing import Dict, Iterable, List, Optional, Tuple
+from cymem.cymem cimport Pool
 cimport numpy as np
 from cymem.cymem cimport Pool
 
@@ -14,7 +15,10 @@ from libcpp.vector cimport vector
 import random
 
 import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
 import numpy.random
 import numpy
 import numpy.random
@@ -34,6 +38,9 @@ from .trainable_pipe import TrainablePipe
 
 from ._parser_internals cimport _beam_utils
 
+from ..training import validate_examples, validate_get_examples
+from ..training import validate_distillation_examples
+from ..errors import Errors, Warnings
 from .. import util
 from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
@@ -212,6 +219,121 @@ cdef class Parser(TrainablePipe):
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError
 
+    def distill(self,
+               teacher_pipe: Optional[TrainablePipe],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None):
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is trained on the transition probabilities
+        of the teacher.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+        
+        DOCS: https://spacy.io/api/dependencyparser#distill
+        """
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+
+        validate_distillation_examples(examples, "TransitionParser.distill")
+
+        set_dropout_rate(self.model, drop)
+
+        student_docs = [eg.predicted for eg in examples]
+
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
+        max_moves = self.cfg["update_with_oracle_cut_size"]
+        if max_moves >= 1:
+            # Chop sequences into lengths of this many words, to make the
+            # batch uniform length. Since we do not have a gold standard
+            # sequence, we use the teacher's predictions as the gold
+            # standard.
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+        else:
+            states = self.moves.init_batch(student_docs)
+
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(student_docs)
+
+        if sgd is not None:
+            self.finish_update(sgd)
+
+        losses[self.name] += loss
+
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
+        return losses
+
+
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
         """Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.
@@ -645,6 +767,40 @@ cdef class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
+    def _init_batch(self, teacher_step_model, docs, max_length):
+        """Make a square batch of length equal to the shortest transition
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]. In contrast to
+        _init_gold_batch, this version uses a teacher model to generate the
+        cut sequences."""
+        cdef:
+            StateClass start_state
+            StateClass state
+            Transition action
+        all_states = self.moves.init_batch(docs)
+        states = []
+        to_cut = []
+        for state, doc in zip(all_states, docs):
+            if not state.is_final():
+                if len(doc) < max_length:
+                    states.append(state)
+                else:
+                    to_cut.append(state)
+        while to_cut:
+            states.extend(state.copy() for state in to_cut)
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
+        return states
+
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 1509c31bbba..54ee053981f 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
     assert ents[1].kb_id == 0
 
 
+def test_is_distillable():
+    nlp = English()
+    ner = nlp.add_pipe("ner")
+    assert ner.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_ner = teacher.add_pipe("ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            teacher_ner.add_label(ent[2])
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.00001
+
+    student = English()
+    student_ner = student.add_pipe("ner")
+    student_ner.initialize(
+        get_examples=lambda: train_examples, labels=teacher_ner.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(100):
+        losses = {}
+        student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.0001
+
+    # test the trained model
+    test_text = "I like London."
+    doc = student(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+
+
 def test_beam_ner_scores():
     # Test that we can get confidence values out of the beam_ner pipe
     beam_width = 16
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 3565c62af0f..a943c3538e0 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -396,6 +396,55 @@ def test_overfitting_IO(pipe_name):
     assert_equal(batch_deps_1, no_batch_deps)
 
 
+def test_is_distillable():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+    assert parser.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_parser = teacher.add_pipe("parser")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            teacher_parser.add_label(dep)
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(200):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["parser"] < 0.0001
+
+    student = English()
+    student_parser = student.add_pipe("parser")
+    student_parser.initialize(
+        get_examples=lambda: train_examples, labels=teacher_parser.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(200):
+        losses = {}
+        student_parser.distill(
+            teacher_parser, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["parser"] < 0.0001
+
+    test_text = "I like securities."
+    doc = student(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
+
+
 # fmt: off
 @pytest.mark.slow
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index ba2ed4e5ff3..0f204ead477 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -214,6 +214,53 @@ def test_overfitting_IO(top_k):
     assert doc4[3].lemma_ == "egg"
 
 
+def test_is_distillable():
+    nlp = English()
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    assert lemmatizer.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
+    teacher_lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    student = English()
+    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+    student_lemmatizer.min_tree_freq = 1
+    student_lemmatizer.initialize(
+        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_lemmatizer.distill(
+            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    test_text = "She likes blue eggs"
+    doc = student(test_text)
+    assert doc[0].lemma_ == "she"
+    assert doc[1].lemma_ == "like"
+    assert doc[2].lemma_ == "blue"
+    assert doc[3].lemma_ == "egg"
+
+
 def test_lemmatizer_requires_labels():
     nlp = English()
     nlp.add_pipe("trainable_lemmatizer")
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index c2b65977ac3..fffb7b4ed7f 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -77,6 +77,12 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)
 
 
+def test_is_distillable():
+    nlp = English()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.is_distillable
+
+
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 2e40d86ff48..94285178310 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -11,6 +11,12 @@
 from spacy.training import Example
 
 
+def test_is_distillable():
+    nlp = English()
+    senter = nlp.add_pipe("senter")
+    assert senter.is_distillable
+
+
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5deb323dd71..5da5c209975 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -237,6 +237,52 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_is_distillable():
+    nlp = English()
+    tagger = nlp.add_pipe("tagger")
+    assert tagger.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["tagger"] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_tagger.distill(
+            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 710dac0571d..214c1bfbed1 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -602,6 +602,12 @@ def test_initialize_examples(name, get_examples, train_data):
         nlp.initialize(get_examples=get_examples())
 
 
+def test_is_distillable():
+    nlp = English()
+    textcat = nlp.add_pipe("textcat")
+    assert not textcat.is_distillable
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index a492a8be358..68f7e8a0d57 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,16 +8,10 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import (
-    Alignment,
-    Corpus,
-    Example,
-    biluo_tags_to_offsets,
-    biluo_tags_to_spans,
-    docs_to_json,
-    iob_to_biluo,
-    offsets_to_biluo_tags,
-)
+from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
+from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
+from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
@@ -377,6 +371,19 @@ def test_example_from_dict_some_ner(en_vocab):
     assert ner_tags == ["U-LOC", None, None, None]
 
 
+def test_validate_distillation_examples(en_vocab):
+    words = ["a", "b", "c", "d"]
+    spaces = [True, True, False, True]
+    predicted = Doc(en_vocab, words=words, spaces=spaces)
+
+    example = Example.from_dict(predicted, {})
+    validate_distillation_examples([example], "test_validate_distillation_examples")
+
+    example = Example.from_dict(predicted, {"words": words + ["e"]})
+    with pytest.raises(ValueError, match=r"distillation"):
+        validate_distillation_examples([example], "test_validate_distillation_examples")
+
+
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_json_to_docs_no_ner(en_vocab):
     data = [
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 5c2ba99320d..358b2bd806d 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,3 +1,6 @@
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index abdcecf71d1..914e877f579 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,13 @@ def validate_examples(examples, method):
         raise TypeError(err)
 
 
+def validate_distillation_examples(examples, method):
+    validate_examples(examples, method)
+    for eg in examples:
+        if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
+            raise ValueError(Errors.E4003)
+
+
 def validate_get_examples(get_examples, method):
     """Check that a generator of a batch of examples received during processing is valid:
     the callable produces a non-empty list of Example objects.
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 771a00aeee1..5179ce48b84 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -131,6 +131,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## DependencyParser.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("parser")
+> student_pipe = student.add_pipe("parser")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -268,6 +301,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## DependencyParser.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_parser = teacher.get_pipe("parser")
+> student_parser = student.add_pipe("parser")
+> student_scores = student_parser.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_parser.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_parser.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## DependencyParser.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 17af19e8c38..2e099365758 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -115,6 +115,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## EditTreeLemmatizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("trainable_lemmatizer")
+> student_pipe = student.add_pipe("trainable_lemmatizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -269,6 +302,27 @@ Create an optimizer for the pipeline component.
 | ----------- | ---------------------------- |
 | **RETURNS** | The optimizer. ~~Optimizer~~ |
 
+## EditTreeLemmatizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_lemmatizer = teacher.get_pipe("trainable_lemmatizer")
+> student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+> student_scores = student_lemmatizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_lemmatizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_lemmatizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## EditTreeLemmatizer.use_params {id="use_params",tag="method, contextmanager"}
 
 Modify the pipe's model, to use the given parameter values. At the end of the
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 1f386bbb6ff..005d5d11deb 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -127,6 +127,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## EntityRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("ner")
+> student_pipe = student.add_pipe("ner")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -264,6 +297,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## EntityRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_ner = teacher.get_pipe("ner")
+> student_ner = student.add_pipe("ner")
+> student_scores = student_ner.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_ner.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_ner.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## EntityRecognizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 1fda807cb32..4f79458d319 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -121,6 +121,39 @@ delegate to the [`predict`](/api/morphologizer#predict) and
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Morphologizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("morphologizer")
+> student_pipe = student.add_pipe("morphologizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -259,6 +292,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_morphologizer = teacher.get_pipe("morphologizer")
+> student_morphologizer = student.add_pipe("morphologizer")
+> student_scores = student_morphologizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index b387ea58654..120c8f6908f 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -234,6 +234,39 @@ predictions and gold-standard annotations, and update the component's model.
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 
+## TrainablePipe.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("your_custom_pipe")
+> student_pipe = student.add_pipe("your_custom_pipe")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
@@ -281,6 +314,34 @@ This method needs to be overwritten with your own custom `get_loss` method.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## TrainablePipe.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+<Infobox variant="danger">
+
+This method needs to be overwritten with your own custom
+`get_teacher_student_loss` method.
+
+</Infobox>
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.get_pipe("your_custom_pipe")
+> student_pipe = student.add_pipe("your_custom_pipe")
+> student_scores = student_pipe.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_pipe.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_pipe.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## TrainablePipe.score {id="score",tag="method",version="3"}
 
 Score a batch of examples.
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index d5d096d7659..02fd57102e2 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -106,6 +106,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## SentenceRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("senter")
+> student_pipe = student.add_pipe("senter")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -254,6 +287,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## SentenceRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_senter = teacher.get_pipe("senter")
+> student_senter = student.add_pipe("senter")
+> student_scores = student_senter.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_senter.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_senter.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## SentenceRecognizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index ae14df212ee..664fd7940c1 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -105,6 +105,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tagger.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tagger")
+> student_pipe = student.add_pipe("tagger")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Tagger.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -265,6 +298,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## Tagger.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_tagger = teacher.get_pipe("tagger")
+> student_tagger = student.add_pipe("tagger")
+> student_scores = student_tagger.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_tagger.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_tagger.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## Tagger.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9cdc0c8ab02..77216924405 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -1048,7 +1048,8 @@ backprop passes.
 Recursively wrap both the models and methods of each pipe using
 [NVTX](https://nvidia.github.io/NVTX/) range markers. By default, the following
 methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
-`get_loss`, `initialize`, `begin_update`, `finish_update`, `update`.
+`get_loss`, `get_teacher_student_loss`, `initialize`, `begin_update`,
+`finish_update`, `update`.
 
 | Name                        | Description                                                                                                                                                     |
 | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index fb5de5da102..9dbdadd0ebc 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1355,12 +1355,14 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.
 
-| Name                                 | Description                                                                                                                                                                                                                                                                                                                                   |
-| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
-| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
-| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
-| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
+| Name                                                             | Description                                                                                                                                                                                                                                                                                                                                   |
+| ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`update`](/api/pipe#update)                                     | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
+| [`distill`](/api/pipe#distill)                                   | Learn from a teacher pipeline using a batch of [`Doc`](/api/doc) objects and update the component's model.                                                                                                                                                                                                                                    |
+| [`initialize`](/api/pipe#initialize)                             | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
+| [`get_loss`](/api/pipe#get_loss)                                 | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
+| [`get_teacher_student_loss`](/api/pipe#get_teacher_student_loss) | Return a tuple of the loss and the gradient for the student scores relative to the teacher scores.                                                                                                                                                                                                                                            |
+| [`score`](/api/pipe#score)                                       | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
 

From b2608b3fab277f69d669158514d00dbd49c73295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 11:27:45 +0100
Subject: [PATCH 037/504] Merge the parser refactor into `v4` (#10940)

* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89edb306608826dca50619ea8a60de2b14.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
  this required computing an intermittent flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut` size into
separate sequences that have at most `update_with_oracle_cut`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a git that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/errors.py                               |   3 +
 spacy/ml/_precomputable_affine.py             | 164 -----
 spacy/ml/models/parser.py                     | 174 ++---
 spacy/ml/parser_model.pxd                     |  55 --
 spacy/ml/parser_model.pyx                     | 539 ---------------
 spacy/ml/tb_framework.pxd                     |  28 +
 spacy/ml/tb_framework.py                      |  51 --
 spacy/ml/tb_framework.pyx                     | 621 ++++++++++++++++++
 .../_parser_internals/_beam_utils.pyx         |   3 +-
 .../_parser_internals/_parser_utils.pxd       |   2 +
 .../_parser_internals/_parser_utils.pyx       |  22 +
 spacy/pipeline/_parser_internals/_state.pxd   |  59 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +
 spacy/pipeline/_parser_internals/batch.pxd    |   2 +
 spacy/pipeline/_parser_internals/batch.pyx    |  52 ++
 spacy/pipeline/_parser_internals/ner.pyx      |   3 +
 .../pipeline/_parser_internals/stateclass.pyx |   7 +
 .../_parser_internals/transition_system.pxd   |   7 +
 .../_parser_internals/transition_system.pyx   |  71 ++
 .../{dep_parser.pyx => dep_parser.py}         |  17 +-
 spacy/pipeline/{ner.pyx => ner.py}            |  33 +-
 spacy/pipeline/transition_parser.pxd          |  31 -
 spacy/pipeline/transition_parser.pyx          | 509 ++++++--------
 spacy/tests/parser/test_ner.py                |  11 +-
 spacy/tests/parser/test_parse.py              |  80 ++-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  53 +-
 spacy/training/example.pyx                    |   1 -
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 35 files changed, 1293 insertions(+), 1426 deletions(-)
 delete mode 100644 spacy/ml/_precomputable_affine.py
 delete mode 100644 spacy/ml/parser_model.pxd
 delete mode 100644 spacy/ml/parser_model.pyx
 create mode 100644 spacy/ml/tb_framework.pxd
 delete mode 100644 spacy/ml/tb_framework.py
 create mode 100644 spacy/ml/tb_framework.pyx
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 create mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 create mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (97%)
 rename spacy/pipeline/{ner.pyx => ner.py} (93%)
 delete mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index 0eb529c2098..a4a87d68aec 100755
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,10 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -45,6 +43,7 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -52,6 +51,7 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2817147f3e9..36325711d4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,12 +90,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -111,12 +110,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -387,12 +385,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -405,12 +402,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/errors.py b/spacy/errors.py
index 9bdb66006e5..9074a3fead8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -215,6 +215,8 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -978,6 +980,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
+    E4004 = ("Backprop is not supported when is_train is not set.")
 
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
deleted file mode 100644
index 1c20c622b2c..00000000000
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..59483839206 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,18 +1,22 @@
-from typing import List, Optional, cast
-
-from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
+from typing import Optional, List, Tuple, Any
 from thinc.types import Floats2d
+from thinc.api import Model
+import warnings
 
+from ...errors import Errors, Warnings
 from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
-from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
+from ...tokens.doc import Doc
 
+TransitionSystem = Any  # TODO
+State = Any  # TODO
 
-@registry.architectures("spacy.TransitionBasedParser.v2")
-def build_tb_parser_model(
+
+@registry.architectures.register("spacy.TransitionBasedParser.v2")
+def transition_parser_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
@@ -20,6 +24,46 @@ def build_tb_parser_model(
     maxout_pieces: int,
     use_upper: bool,
     nO: Optional[int] = None,
+) -> Model:
+    if not use_upper:
+        warnings.warn(Warnings.W400)
+
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v3")
+def transition_parser_v3(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+) -> Model:
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+def build_tb_parser_model(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
 ) -> Model:
     """
     Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -52,14 +96,7 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
-        is replaced with a ReLu non-linearity if use_upper=True, and no
-        non-linearity if use_upper=False.
-    use_upper (bool): Whether to use an additional hidden layer after the state
-        vector in order to predict the action scores. It is recommended to set
-        this to False for large pretrained models such as transformers, and True
-        for smaller networks. The upper layer is computed on CPU, which becomes
-        a bottleneck on larger GPU-based models, where it's also less necessary.
+        Recommended values are 1, 2 or 3.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -70,106 +107,11 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        list2array(),
-        Linear(hidden_width, t2v_width),
+    return TransitionModel(
+        tok2vec=tok2vec,
+        state_tokens=nr_feature_tokens,
+        hidden_width=hidden_width,
+        maxout_pieces=maxout_pieces,
+        nO=nO,
+        unseen_classes=set(),
     )
-    tok2vec.set_dim("nO", hidden_width)
-    lower = _define_lower(
-        nO=hidden_width if use_upper else nO,
-        nF=nr_feature_tokens,
-        nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
-    )
-    upper = None
-    if use_upper:
-        with use_ops("cpu"):
-            # Initialize weights at zero, as it's a classification layer.
-            upper = _define_upper(nO=nO, nI=None)
-    return TransitionModel(tok2vec, lower, upper, resize_output)
-
-
-def _define_upper(nO, nI):
-    return Linear(nO=nO, nI=nI, init_W=zero_init)
-
-
-def _define_lower(nO, nF, nI, nP):
-    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
-
-
-def resize_output(model, new_nO):
-    if model.attrs["has_upper"]:
-        return _resize_upper(model, new_nO)
-    return _resize_lower(model, new_nO)
-
-
-def _resize_upper(model, new_nO):
-    upper = model.get_ref("upper")
-    if upper.has_dim("nO") is None:
-        upper.set_dim("nO", new_nO)
-        return model
-    elif new_nO == upper.get_dim("nO"):
-        return model
-
-    smaller = upper
-    nI = smaller.maybe_get_dim("nI")
-    with use_ops("cpu"):
-        larger = _define_upper(nO=new_nO, nI=nI)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc2f(new_nO, nI)
-        larger_b = larger.ops.alloc1f(new_nO)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        # Weights are stored in (nr_out, nr_in) format, so we're basically
-        # just adding rows here.
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:old_nO] = smaller_W
-            larger_b[:old_nO] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-    model._layers[-1] = larger
-    model.set_ref("upper", larger)
-    return model
-
-
-def _resize_lower(model, new_nO):
-    lower = model.get_ref("lower")
-    if lower.has_dim("nO") is None:
-        lower.set_dim("nO", new_nO)
-        return model
-
-    smaller = lower
-    nI = smaller.maybe_get_dim("nI")
-    nF = smaller.maybe_get_dim("nF")
-    nP = smaller.maybe_get_dim("nP")
-    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
-        larger_b = larger.ops.alloc2f(new_nO, nP)
-        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        smaller_pad = smaller.get_param("pad")
-        # Copy the old weights and padding into the new layer
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:, 0:old_nO, :, :] = smaller_W
-            larger_pad[:, :, 0:old_nO, :] = smaller_pad
-            larger_b[0:old_nO, :] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-        larger.set_param("pad", larger_pad)
-    model._layers[1] = larger
-    model.set_ref("lower", larger)
-    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
deleted file mode 100644
index 4d2d7b3feeb..00000000000
--- a/spacy/ml/parser_model.pxd
+++ /dev/null
@@ -1,55 +0,0 @@
-from libc.string cimport memcpy, memset
-from thinc.backends.cblas cimport CBlas
-
-from ..pipeline._parser_internals._state cimport StateC
-from ..typedefs cimport hash_t, weight_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const float* seen_classes
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* scores
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
-
-
-cdef WeightsC get_c_weights(model) except *
-
-cdef SizesC get_c_sizes(model, int batch_size) except *
-
-cdef ActivationsC alloc_activations(SizesC n) nogil
-
-cdef void free_activations(const ActivationsC* A) nogil
-
-cdef void predict_states(
-    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) nogil
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
-
-cdef void cpu_log_loss(
-    float* d_scores,
-    const float* costs,
-    const int* is_valid,
-    const float* scores,
-    int O
-) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
deleted file mode 100644
index 10a9f0bc485..00000000000
--- a/spacy/ml/parser_model.pyx
+++ /dev/null
@@ -1,539 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-# cython: profile=False
-cimport numpy as np
-from libc.math cimport exp
-from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.cblas cimport saxpy, sgemm
-from thinc.backends.linalg cimport Vec, VecVec
-
-import numpy
-import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps
-
-from .. import util
-from ..errors import Errors
-
-from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport weight_t
-
-
-cdef WeightsC get_c_weights(model) except *:
-    cdef WeightsC output
-    cdef precompute_hiddens state2vec = model.state2vec
-    output.feat_weights = state2vec.get_feat_weights()
-    output.feat_bias = <const float*>state2vec.bias.data
-    cdef np.ndarray vec2scores_W
-    cdef np.ndarray vec2scores_b
-    if model.vec2scores is None:
-        output.hidden_weights = NULL
-        output.hidden_bias = NULL
-    else:
-        vec2scores_W = model.vec2scores.get_param("W")
-        vec2scores_b = model.vec2scores.get_param("b")
-        output.hidden_weights = <const float*>vec2scores_W.data
-        output.hidden_bias = <const float*>vec2scores_b.data
-    cdef np.ndarray class_mask = model._class_mask
-    output.seen_classes = <const float*>class_mask.data
-    return output
-
-
-cdef SizesC get_c_sizes(model, int batch_size) except *:
-    cdef SizesC output
-    output.states = batch_size
-    if model.vec2scores is None:
-        output.classes = model.state2vec.get_dim("nO")
-    else:
-        output.classes = model.vec2scores.get_dim("nO")
-    output.hiddens = model.state2vec.get_dim("nO")
-    output.pieces = model.state2vec.get_dim("nP")
-    output.feats = model.state2vec.get_dim("nF")
-    output.embed_width = model.tokvecs.shape[1]
-    return output
-
-
-cdef ActivationsC alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    resize_activations(&A, n)
-    return A
-
-
-cdef void free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.scores)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(
-            A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
-        )
-        A.scores = <float*>realloc(
-            A.scores, n.states * n.classes * sizeof(A.scores[0])
-        )
-        A.unmaxed = <float*>realloc(
-            A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
-        )
-        A.hiddens = <float*>realloc(
-            A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
-        )
-        A.is_valid = <int*>realloc(
-            A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
-        )
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void predict_states(
-    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) nogil:
-    resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(
-        cblas,
-        A.unmaxed,
-        W.feat_weights,
-        A.token_ids,
-        n.states,
-        n.feats,
-        n.hiddens * n.pieces
-    )
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = _arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    memset(A.scores, 0, n.states * n.classes * sizeof(float))
-    if W.hidden_weights == NULL:
-        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(
-            False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes
-        )
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = A.scores[0]
-    for i in range(1, n.states * n.classes):
-        if A.scores[i] < min_:
-            min_ = A.scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if not W.seen_classes[j]:
-                A.scores[i*n.classes+j] = min_
-
-
-cdef void sum_state_features(
-    CBlas cblas,
-    float* output,
-    const float* cached,
-    const int* token_ids,
-    int B,
-    int F,
-    int O
-) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    padding = cached
-    cached += F * O
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
-
-
-cdef void cpu_log_loss(
-    float* d_scores,
-    const float* costs,
-    const int* is_valid,
-    const float* scores,
-    int O
-) nogil:
-    """Do multi-label log loss"""
-    cdef double max_, gmax, Z, gZ
-    best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = _arg_max(scores, O)
-
-    if best == -1 or guess == -1:
-        # These shouldn't happen, but if they do, we want to make sure we don't
-        # cause an OOB access.
-        return
-    Z = 1e-10
-    gZ = 1e-10
-    max_ = scores[guess]
-    gmax = scores[best]
-    for i in range(O):
-        Z += exp(scores[i] - max_)
-        if costs[i] <= costs[best]:
-            gZ += exp(scores[i] - gmax)
-    for i in range(O):
-        if costs[i] <= costs[best]:
-            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
-        else:
-            d_scores[i] = exp(scores[i]-max_) / Z
-
-
-cdef int arg_max_if_gold(
-    const weight_t* scores, const weight_t* costs, const int* is_valid, int n
-) nogil:
-    # Find minimum cost
-    cdef float cost = 1
-    for i in range(n):
-        if is_valid[i] and costs[i] < cost:
-            cost = costs[i]
-    # Now find best-scoring with that cost
-    cdef int best = -1
-    for i in range(n):
-        if costs[i] <= cost and is_valid[i]:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-class ParserStepModel(Model):
-    def __init__(
-        self,
-        docs,
-        layers,
-        *,
-        has_upper,
-        unseen_classes=None,
-        train=True,
-        dropout=0.1
-    ):
-        Model.__init__(self, name="parser_step_model", forward=step_forward)
-        self.attrs["has_upper"] = has_upper
-        self.attrs["dropout_rate"] = dropout
-        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
-        if layers[1].get_dim("nP") >= 2:
-            activation = "maxout"
-        elif has_upper:
-            activation = None
-        else:
-            activation = "relu"
-        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
-                                            activation=activation, train=train)
-        if has_upper:
-            self.vec2scores = layers[-1]
-        else:
-            self.vec2scores = None
-        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
-        self.backprops = []
-        self._class_mask = numpy.zeros((self.nO,), dtype='f')
-        self._class_mask.fill(1)
-        if unseen_classes is not None:
-            for class_ in unseen_classes:
-                self._class_mask[class_] = 0.
-
-    def clear_memory(self):
-        del self.tokvecs
-        del self.bp_tokvecs
-        del self.state2vec
-        del self.backprops
-        del self._class_mask
-
-    @property
-    def nO(self):
-        if self.attrs["has_upper"]:
-            return self.vec2scores.get_dim("nO")
-        else:
-            return self.state2vec.get_dim("nO")
-
-    def class_is_unseen(self, class_):
-        return self._class_mask[class_]
-
-    def mark_class_unseen(self, class_):
-        self._class_mask[class_] = 0
-
-    def mark_class_seen(self, class_):
-        self._class_mask[class_] = 1
-
-    def get_token_ids(self, states):
-        cdef StateClass state
-        states = [state for state in states if not state.is_final()]
-        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
-                                          dtype='i', order='C')
-        ids.fill(-1)
-        c_ids = <int*>ids.data
-        for state in states:
-            state.c.set_context_tokens(c_ids, ids.shape[1])
-            c_ids += ids.shape[1]
-        return ids
-
-    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-        if (
-            isinstance(self.state2vec.ops, CupyOps)
-            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
-        ):
-            # Move token_ids and d_vector to GPU, asynchronously
-            self.backprops.append((
-                util.get_async(self.cuda_stream, token_ids),
-                util.get_async(self.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
-    def finish_steps(self, golds):
-        # Add a padding vector to the d_tokvecs gradient, so that missing
-        # values don't affect the real gradient.
-        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
-        # Tells CUDA to block, so our async copies complete.
-        if self.cuda_stream is not None:
-            self.cuda_stream.synchronize()
-        for ids, d_vector, bp_vector in self.backprops:
-            d_state_features = bp_vector((d_vector, ids))
-            ids = ids.flatten()
-            d_state_features = d_state_features.reshape(
-                (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
-        # Padded -- see update()
-        self.bp_tokvecs(d_tokvecs[:-1])
-        return d_tokvecs
-
-
-NUMPY_OPS = NumpyOps()
-
-
-def step_forward(model: ParserStepModel, states, is_train):
-    token_ids = model.get_token_ids(states)
-    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
-    mask = None
-    if model.attrs["has_upper"]:
-        dropout_rate = model.attrs["dropout_rate"]
-        if is_train and dropout_rate > 0:
-            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
-            vector *= mask
-        scores, get_d_vector = model.vec2scores(vector, is_train)
-    else:
-        scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores  # no-cython-lint: E731
-    # If the class is unseen, make sure its score is minimum
-    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
-
-    def backprop_parser_step(d_scores):
-        # Zero vectors for unseen classes
-        d_scores *= model._class_mask
-        d_vector = get_d_vector(d_scores)
-        if mask is not None:
-            d_vector *= mask
-        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
-        return None
-    return scores, backprop_parser_step
-
-
-cdef class precompute_hiddens:
-    """Allow a model to be "primed" by pre-computing input features in bulk.
-
-    This is used for the parser, where we want to take a batch of documents,
-    and compute vectors for each (token, position) pair. These vectors can then
-    be reused, especially for beam-search.
-
-    Let's say we're using 12 features for each state, e.g. word at start of
-    buffer, three words on stack, their children, etc. In the normal arc-eager
-    system, a document of length N is processed in 2*N states. This means we'll
-    create 2*N*12 feature vectors --- but if we pre-compute, we only need
-    N*12 vector computations. The saving for beam-search is much better:
-    if we have a beam of k, we'll normally make 2*N*12*K computations --
-    so we can save the factor k. This also gives a nice CPU/GPU division:
-    we can do all our hard maths up front, packed into large multiplications,
-    and do the hard-to-program parsing on the CPU.
-    """
-    cdef readonly int nF, nO, nP
-    cdef bint _is_synchronized
-    cdef public object ops
-    cdef public object numpy_ops
-    cdef public object _cpu_ops
-    cdef np.ndarray _features
-    cdef np.ndarray _cached
-    cdef np.ndarray bias
-    cdef object _cuda_stream
-    cdef object _bp_hiddens
-    cdef object activation
-
-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
-                 activation="maxout", train=False):
-        gpu_cached, bp_features = lower_model(tokvecs, train)
-        cdef np.ndarray cached
-        if not isinstance(gpu_cached, numpy.ndarray):
-            # Note the passing of cuda_stream here: it lets
-            # cupy make the copy asynchronously.
-            # We then have to block before first use.
-            cached = gpu_cached.get(stream=cuda_stream)
-        else:
-            cached = gpu_cached
-        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
-            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
-        else:
-            self.bias = lower_model.get_param("b")
-        self.nF = cached.shape[1]
-        if lower_model.has_dim("nP"):
-            self.nP = lower_model.get_dim("nP")
-        else:
-            self.nP = 1
-        self.nO = cached.shape[2]
-        self.ops = lower_model.ops
-        self.numpy_ops = NumpyOps()
-        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
-        assert activation in (None, "relu", "maxout")
-        self.activation = activation
-        self._is_synchronized = False
-        self._cuda_stream = cuda_stream
-        self._cached = cached
-        self._bp_hiddens = bp_features
-
-    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized and self._cuda_stream is not None:
-            self._cuda_stream.synchronize()
-            self._is_synchronized = True
-        return <float*>self._cached.data
-
-    def has_dim(self, name):
-        if name == "nF":
-            return self.nF if self.nF is not None else True
-        elif name == "nP":
-            return self.nP if self.nP is not None else True
-        elif name == "nO":
-            return self.nO if self.nO is not None else True
-        else:
-            return False
-
-    def get_dim(self, name):
-        if name == "nF":
-            return self.nF
-        elif name == "nP":
-            return self.nP
-        elif name == "nO":
-            return self.nO
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def set_dim(self, name, value):
-        if name == "nF":
-            self.nF = value
-        elif name == "nP":
-            self.nP = value
-        elif name == "nO":
-            self.nO = value
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def __call__(self, X, bint is_train):
-        if is_train:
-            return self.begin_update(X)
-        else:
-            return self.predict(X), lambda X: X
-
-    def predict(self, X):
-        return self.begin_update(X)[0]
-
-    def begin_update(self, token_ids):
-        cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO, self.nP), dtype='f')
-        # This is tricky, but (assuming GPU available);
-        # - Input to forward on CPU
-        # - Output from forward on CPU
-        # - Input to backward on GPU!
-        # - Output from backward on GPU
-        bp_hiddens = self._bp_hiddens
-
-        cdef CBlas cblas = self._cpu_ops.cblas()
-
-        feat_weights = self.get_feat_weights()
-        cdef int[:, ::1] ids = token_ids
-        sum_state_features(
-            cblas, <float*>state_vector.data,
-            feat_weights, &ids[0, 0],
-            token_ids.shape[0], self.nF, self.nO*self.nP
-        )
-        state_vector += self.bias
-        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
-
-        def backward(d_state_vector_ids):
-            d_state_vector, token_ids = d_state_vector_ids
-            d_state_vector = bp_nonlinearity(d_state_vector)
-            d_tokens = bp_hiddens((d_state_vector, token_ids))
-            return d_tokens
-        return state_vector, backward
-
-    def _nonlinearity(self, state_vector):
-        if self.activation == "maxout":
-            return self._maxout_nonlinearity(state_vector)
-        else:
-            return self._relu_nonlinearity(state_vector)
-
-    def _maxout_nonlinearity(self, state_vector):
-        state_vector, mask = self.numpy_ops.maxout(state_vector)
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_maxout(d_best):
-            return self.ops.backprop_maxout(d_best, mask, self.nP)
-
-        return state_vector, backprop_maxout
-
-    def _relu_nonlinearity(self, state_vector):
-        state_vector = state_vector.reshape((state_vector.shape[0], -1))
-        mask = state_vector >= 0.
-        state_vector *= mask
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_relu(d_best):
-            d_best *= mask
-            return d_best.reshape((d_best.shape + (1,)))
-
-        return state_vector, backprop_relu
-
-cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
new file mode 100644
index 00000000000..965508519e8
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
deleted file mode 100644
index e351ad4e570..00000000000
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from thinc.api import Model, noop
-
-from ..util import registry
-from .parser_model import ParserStepModel
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
new file mode 100644
index 00000000000..79be13b00bd
--- /dev/null
+++ b/spacy/ml/tb_framework.pyx
@@ -0,0 +1,621 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+from typing import List, Tuple, Any, Optional, TypeVar, cast
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from libcpp.vector cimport vector
+import numpy
+cimport numpy as np
+from thinc.api import Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.api import NumpyOps
+from thinc.backends.cblas cimport CBlas, saxpy, sgemm
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
+from thinc.types import Ints1d, Ints2d
+
+from ..errors import Errors
+from ..pipeline._parser_internals import _beam_utils
+from ..pipeline._parser_internals.batch import GreedyBatch
+from ..pipeline._parser_internals._parser_utils cimport arg_max
+from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
+from ..pipeline._parser_internals.transition_system cimport TransitionSystem
+from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..tokens.doc import Doc
+from ..util import registry
+
+
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
+def TransitionModel(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    beam_width: int = 1,
+    beam_density: float = 0.0,
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
+
+    # FIXME: we use `output` as a container for the output layer's
+    # weights and biases. Thinc optimizers cannot handle resizing
+    # of parameters. So, when the parser model is resized, we
+    # construct a new `output` layer, which has a different key in
+    # the optimizer. Once the optimizer supports parameter resizing,
+    # we can replace the `output` layer by `output_W` and `output_b`
+    # parameters in this model.
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+
+    return Model(
+        name="parser_model",
+        forward=forward,
+        init=init,
+        layers=[tok2vec_projected, output],
+        refs={
+            "tok2vec": tok2vec_projected,
+            "output": output,
+        },
+        params={
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
+        attrs={
+            "beam_width": beam_width,
+            "beam_density": beam_density,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    output = model.get_ref("output")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        output.set_dim("nO", new_nO)
+        output.initialize()
+        return model
+    elif new_nO <= old_nO:
+        return model
+    elif output.has_param("W"):
+        nH = model.get_dim("nH")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
+        new_W[:old_nO] = old_W  # type: ignore
+        new_b[:old_nO] = old_b  # type: ignore
+        for i in range(old_nO, new_nO):
+            model.attrs["unseen_classes"].add(i)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
+    # TODO: Avoid this private intrusion
+    model._dims["nO"] = new_nO
+    return model
+
+
+def init(
+    model,
+    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
+    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
+):
+    if X is not None:
+        docs, moves = X
+        model.get_ref("tok2vec").initialize(X=docs)
+    else:
+        model.get_ref("tok2vec").initialize()
+    inferred_nO = _infer_nO(Y)
+    if inferred_nO is not None:
+        current_nO = model.maybe_get_dim("nO")
+        if current_nO is None or current_nO != inferred_nO:
+            model.attrs["resize_output"](model, inferred_nO)
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nH = model.get_dim("nH")
+    nI = model.get_dim("nI")
+    nF = model.get_dim("nF")
+    ops = model.ops
+
+    Wl = ops.alloc2f(nH * nP, nF * nI)
+    bl = ops.alloc1f(nH * nP)
+    padl = ops.alloc1f(nI)
+    # Wl = zero_init(ops, Wl.shape)
+    Wl = glorot_uniform_init(ops, Wl.shape)
+    padl = uniform_init(ops, padl.shape)  # type: ignore
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
+    # model = _lsuv_init(model)
+    return model
+
+
+class TransitionModelInputs:
+    """
+    Input to transition model.
+    """
+
+    # dataclass annotation is not yet supported in Cython 0.29.x,
+    # so, we'll do something close to it.
+
+    actions: Optional[List[Ints1d]]
+    docs: List[Doc]
+    max_moves: int
+    moves: TransitionSystem
+    states: Optional[List[State]]
+
+    __slots__ = [
+        "actions",
+        "docs",
+        "max_moves",
+        "moves",
+        "states",
+    ]
+
+    def __init__(
+        self,
+        docs: List[Doc],
+        moves: TransitionSystem,
+        actions: Optional[List[Ints1d]]=None,
+        max_moves: int=0,
+        states: Optional[List[State]]=None):
+        """
+        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
+        docs (List[Doc]): Docs to predict transition sequences for.
+        max_moves: (int): the maximum number of moves to apply, values less
+            than 1 will apply moves to states until they are final states.
+        moves (TransitionSystem): the transition system to use when predicting
+            the transition sequences.
+        states (Optional[List[States]]): the initial states to predict the
+            transition sequences for. When absent, the initial states are
+            initialized from the provided Docs.
+        """
+        self.actions = actions
+        self.docs = docs
+        self.moves = moves
+        self.max_moves = max_moves
+        self.states = states
+
+
+def forward(model, inputs: TransitionModelInputs, is_train: bool):
+    docs = inputs.docs
+    moves = inputs.moves
+    actions = inputs.actions
+
+    beam_width = model.attrs["beam_width"]
+    hidden_pad = model.get_param("hidden_pad")
+    tok2vec = model.get_ref("tok2vec")
+
+    states = moves.init_batch(docs) if inputs.states is None else inputs.states
+    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
+    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
+    seen_mask = _get_seen_mask(model)
+
+    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
+        # Note: max_moves is only used during training, so we don't need to
+        #       pass it to the greedy inference path.
+        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
+    else:
+        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
+            feats, backprop_feats, seen_mask, is_train, actions=actions,
+            max_moves=inputs.max_moves)
+
+
+def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
+                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+    cdef vector[StateC*] c_states
+    cdef StateClass state
+    for state in states:
+        if not state.is_final():
+            c_states.push_back(state.c)
+    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
+    # Precomputed features have rows for each token, plus one for padding.
+    cdef int n_tokens = feats.shape[0] - 1
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
+    cdef CBlas cblas = model.ops.cblas()
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+
+    def backprop(dY):
+        raise ValueError(Errors.E4004)
+
+    return (states, scores), backprop
+
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+    cdef int i, j
+    cdef vector[StateC *] unfinished
+    cdef ActivationsC activations = _alloc_activations(sizes)
+    cdef np.ndarray step_scores
+    cdef np.ndarray step_actions
+
+    scores = []
+    while sizes.states >= 1:
+        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
+        step_actions = actions[0] if actions is not None else None
+        with nogil:
+            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
+            if actions is None:
+                # Validate actions, argmax, take action.
+                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
+                    sizes.states)
+            else:
+                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+        sizes.states = unfinished.size()
+        scores.append(step_scores)
+        unfinished.clear()
+        actions = actions[1:] if actions is not None else None
+    _free_activations(&activations)
+
+    return scores
+
+
+def _forward_fallback(
+    model: Model,
+    moves: TransitionSystem,
+    states: List[StateClass],
+    tokvecs, backprop_tok2vec,
+    feats,
+    backprop_feats,
+    seen_mask,
+    is_train: bool,
+    actions: Optional[List[Ints1d]]=None,
+    max_moves: int=0):
+    nF = model.get_dim("nF")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+
+    beam_width = model.attrs["beam_width"]
+    beam_density = model.attrs["beam_density"]
+
+    ops = model.ops
+
+    all_ids = []
+    all_which = []
+    all_statevecs = []
+    all_scores = []
+    if beam_width == 1:
+        batch = GreedyBatch(moves, states, None)
+    else:
+        batch = _beam_utils.BeamBatch(
+            moves, states, None, width=beam_width, density=beam_density
+        )
+    arange = ops.xp.arange(nF)
+    n_moves = 0
+    while not batch.is_done:
+        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
+        for i, state in enumerate(batch.get_unfinished_states()):
+            state.set_context_tokens(ids, i, nF)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
+        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
+        preacts2f += hidden_b
+        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
+        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
+        statevecs, which = ops.maxout(preacts)
+        # We don't use output's backprop, since we want to backprop for
+        # all states at once, rather than a single state.
+        scores = output.predict(statevecs)
+        scores[:, seen_mask] = ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
+        cpu_scores = ops.to_numpy(scores)
+        if actions is None:
+            batch.advance(cpu_scores)
+        else:
+            batch.advance_with_actions(actions[0])
+            actions = actions[1:]
+        all_scores.append(scores)
+        if is_train:
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
+        if n_moves >= max_moves >= 1:
+            break
+        n_moves += 1
+
+    def backprop_parser(d_states_d_scores):
+        ids = ops.xp.vstack(all_ids)
+        which = ops.xp.vstack(all_which)
+        statevecs = ops.xp.vstack(all_statevecs)
+        _, d_scores = d_states_d_scores
+        if model.attrs.get("unseen_classes"):
+            # If we have a negative gradient (i.e. the probability should
+            # increase) on any classes we filtered out as unseen, mark
+            # them as seen.
+            for clas in set(model.attrs["unseen_classes"]):
+                if (d_scores[:, clas] < 0).any():
+                    model.attrs["unseen_classes"].remove(clas)
+        d_scores *= seen_mask == False
+        # Calculate the gradients for the parameters of the output layer.
+        # The weight gemm is (nS, nO) @ (nS, nH).T
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
+        # This gemm is (nS, nO) @ (nO, nH)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
+        # Backprop through the maxout activation
+        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
+        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        # We don't need to backprop the summation, because we pass back the IDs instead
+        d_state_features = backprop_feats((d_preacts2f, ids))
+        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
+        ops.scatter_add(d_tokvecs, ids, d_state_features)
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
+        return (backprop_tok2vec(d_tokvecs[:-1]), None)
+
+    return (list(batch), all_scores), backprop_parser
+
+
+def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
+    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = True
+    return mask
+
+
+def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
+    W: Floats2d = model.get_param("hidden_W")
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
+    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
+    W3f = W3f.transpose((1, 0, 2))
+    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
+    assert X.shape == (X.shape[0], nI), X.shape
+    Yf_ = model.ops.gemm(X, W2f, trans2=True)
+    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
+
+    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nH, nP), and get back:
+        # (nB, nH, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nH, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        dXf = model.ops.gemm(dY, W)
+        Xf = X[ids].reshape((ids.shape[0], -1))
+        dW = model.ops.gemm(dY, Xf, trans1=True)
+        model.inc_grad("hidden_W", dW)
+        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
+
+    return Yf, backward
+
+
+def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
+    if Y is None:
+        return None
+    _, scores = Y
+    if len(scores) == 0:
+        return None
+    assert scores[0].shape[0] >= 1
+    assert len(scores[0].shape) == 2
+    return scores[0].shape[1]
+
+
+def _lsuv_init(model: Model):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    W = model.maybe_get_param("hidden_W")
+    if W is not None and W.any():
+        return
+
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nH, nP, nI)
+    b = model.ops.alloc2f(nH, nP)
+    pad = model.ops.alloc4f(1, nF, nH, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc_f((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc_f((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("hidden_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("hidden_b", b)
+        else:
+            break
+    return model
+
+
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
+
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = <const float*>hidden_b.data
+    weights.hidden_weights = <const float *> output_W.data
+    weights.hidden_bias = <const float *> output_b.data
+    weights.seen_mask = <const int8_t*> seen_mask.data
+
+    return weights
+
+
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
+
+
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    _resize_activations(&A, n)
+    return A
+
+
+cdef void _free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    if W.hidden_weights == NULL:
+        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+                      1.0, <const float *>A.hiddens, n.hiddens,
+                      <const float *>W.hidden_weights, n.hiddens,
+                      0.0, scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = scores[0]
+    for i in range(1, n.states * n.classes):
+        if scores[i] < min_:
+            min_ = scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if W.seen_mask[j]:
+                scores[i*n.classes+j] = min_
+
+
+cdef void _sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    cdef int B = n.states
+    cdef int O = n.hiddens * n.pieces
+    cdef int F = n.feats
+    cdef int T = n.tokens
+    padding = cached + (T * F * O)
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index d004d313c3e..c86de231d09 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -6,6 +6,7 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
@@ -25,7 +26,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(object):
+cdef class BeamBatch(Batch):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
new file mode 100644
index 00000000000..7fee05bad60
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd
@@ -0,0 +1,2 @@
+cdef int arg_max(const float* scores, const int n_classes) nogil
+cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
new file mode 100644
index 00000000000..582756bf5be
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx
@@ -0,0 +1,22 @@
+# cython: infer_types=True
+
+cdef inline int arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index c063cf97cd4..1b6b25e5f16 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -8,6 +8,7 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -27,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -35,31 +36,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -132,19 +136,20 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
             return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
     int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
         else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
             if b_i >= this.length:
                 return -1
             else:
@@ -243,7 +248,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
             return 1
         else:
             return 0
@@ -327,7 +332,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -347,6 +352,9 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -355,12 +363,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = arcs.back()
+        arc = &arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -400,10 +408,11 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 10f2649baa0..673e36bf5ac 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -778,6 +778,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -862,6 +864,7 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
new file mode 100644
index 00000000000..60734e549aa
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pxd
@@ -0,0 +1,2 @@
+cdef class Batch:
+    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
new file mode 100644
index 00000000000..91073b52e68
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pyx
@@ -0,0 +1,52 @@
+from typing import Any
+
+TransitionSystem = Any  # TODO
+
+cdef class Batch:
+    def advance(self, scores):
+        raise NotImplementedError
+
+    def get_states(self):
+        raise NotImplementedError
+
+    @property
+    def is_done(self):
+        raise NotImplementedError
+
+    def get_unfinished_states(self):
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+
+class GreedyBatch(Batch):
+    def __init__(self, moves: TransitionSystem, states, golds):
+        self._moves = moves
+        self._states = states
+        self._next_states = [s for s in states if not s.is_final()]
+
+    def advance(self, scores):
+        self._next_states = self._moves.transition_states(self._next_states, scores)
+
+    def advance_with_actions(self, actions):
+        self._next_states = self._moves.apply_actions(self._next_states, actions)
+
+    def get_states(self):
+        return self._states
+
+    @property
+    def is_done(self):
+        return all(s.is_final() for s in self._states)
+
+    def get_unfinished_states(self):
+        return [st for st in self._states if not st.is_final()]
+
+    def __getitem__(self, i):
+        return self._states[i]
+
+    def __len__(self):
+        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 6851f9f2096..cf19c834ed9 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -311,6 +311,8 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -648,6 +650,7 @@ cdef class Unit:
         return cost
 
 
+
 cdef class Out:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index 24b9f1adc33..e49ff63c48b 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -20,6 +20,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -176,3 +180,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 04cd10d8864..66cc7747b69 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,3 +57,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index e035053b314..d1340d68c62 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,6 +3,8 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -11,6 +13,7 @@ import srsly
 from ...structs cimport TokenC
 from ...typedefs cimport attr_t, weight_t
 from .stateclass cimport StateClass
+from ._parser_utils cimport arg_max_if_valid
 
 from ... import util
 from ...errors import Errors
@@ -74,7 +77,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -86,6 +100,8 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -111,6 +127,7 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -138,6 +155,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +289,35 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil:
+        cdef int i
+        cdef Transition action
+        cdef StateC* state
+        for i in range(batch_size):
+            state = states[i]
+            action = moves.c[actions[i]]
+            action.do(state, action.label)
+            state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+    int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)
+
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py
similarity index 97%
rename from spacy/pipeline/dep_parser.pyx
rename to spacy/pipeline/dep_parser.py
index 18a220bd631..370a698c25a 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.py
@@ -5,6 +5,8 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from ._parser_internals.arc_eager cimport ArcEager
 from .transition_parser cimport Parser
@@ -19,12 +21,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -230,6 +231,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -237,8 +239,11 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -251,11 +256,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -275,8 +281,7 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py
similarity index 93%
rename from spacy/pipeline/ner.pyx
rename to spacy/pipeline/ner.py
index bb009dc7a6a..4c2a3ac093c 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.py
@@ -5,6 +5,13 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
+from ..util import registry
+from ..training import remove_bilu_prefix
 
 from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
@@ -16,12 +23,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -46,8 +52,12 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -114,7 +124,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -188,11 +203,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -210,15 +226,14 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,   # not relevant for NER
+            min_action_freq=1,  # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
deleted file mode 100644
index 7ef20563b12..00000000000
--- a/spacy/pipeline/transition_parser.pxd
+++ /dev/null
@@ -1,31 +0,0 @@
-from cymem.cymem cimport Pool
-from thinc.backends.cblas cimport CBlas
-
-from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
-from ..vocab cimport Vocab
-from ._parser_internals._state cimport StateC
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from .trainable_pipe cimport TrainablePipe
-
-
-cdef class Parser(TrainablePipe):
-    cdef public object _rehearsal_model
-    cdef readonly TransitionSystem moves
-    cdef public object _multitasks
-    cdef object _cpu_ops
-
-    cdef void _parseC(
-        self,
-        CBlas cblas,
-        StateC** states,
-        WeightsC weights,
-        SizesC sizes
-    ) nogil
-
-    cdef void c_transition_batch(
-        self,
-        StateC** states,
-        const float* scores,
-        int nr_class,
-        int batch_size
-    ) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index feab7e7404b..d71a4ab0355 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,30 +13,29 @@ from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
+import contextlib
 
 import srsly
 from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
+from thinc.api import chain, softmax_activation, use_ops, get_array_module
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 import numpy.random
 import numpy
 import numpy.random
 import srsly
 from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ._parser_internals.stateclass cimport StateClass
+from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
-from ._parser_internals.stateclass cimport StateClass
-
-from .trainable_pipe import TrainablePipe
-
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
+from ..vocab cimport Vocab
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ..typedefs cimport weight_t
 
 from ..training import validate_examples, validate_get_examples
 from ..training import validate_distillation_examples
@@ -49,7 +48,7 @@ from ._parser_internals import _beam_utils
 NUMPY_OPS = NumpyOps()
 
 
-cdef class Parser(TrainablePipe):
+class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -149,8 +148,9 @@ cdef class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
+        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -256,15 +256,6 @@ cdef class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
-        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
-        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
-
-        # Add softmax activation, so that we can compute student losses
-        # with cross-entropy loss.
-        with use_ops("numpy"):
-            teacher_model = chain(teacher_step_model, softmax_activation())
-            student_model = chain(student_step_model, softmax_activation())
-        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -272,51 +263,39 @@ cdef class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        loss = 0.0
-        n_moves = 0
-        while states:
-            # We do distillation as follows: (1) for every state, we compute the
-            # transition softmax distributions: (2) we backpropagate the error of
-            # the student (compared to the teacher) into the student model; (3)
-            # for all states, we move to the next state using the student's
-            # predictions.
-            teacher_scores = teacher_model.predict(states)
-            student_scores, backprop = student_model.begin_update(states)
-            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-            backprop(d_scores)
-            loss += state_loss
-            self.transition_states(states, student_scores)
-            states = [state for state in states if not state.is_final()]
-
-            # Stop when we reach the maximum number of moves, otherwise we start
-            # to process the remainder of cut sequences again.
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+        # We distill as follows: 1. we first let the student predict transition
+        # sequences (and the corresponding transition probabilities); (2) we
+        # let the teacher follow the student's predicted transition sequences
+        # to obtain the teacher's transition probabilities; (3) we compute the
+        # gradients of the student's transition distributions relative to the
+        # teacher's distributions.
+
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
+            moves=self.moves, actions=actions)
+        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
-        backprop_tok2vec(student_docs)
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        backprop_scores((student_states, d_scores))
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
-        del backprop
-        del backprop_tok2vec
-        teacher_step_model.clear_memory()
-        student_step_model.clear_memory()
-        del teacher_model
-        del student_model
-
         return losses
 
 
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+        normalize: bool=False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -328,10 +307,28 @@ cdef class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
+
+        # We can't easily hook up a softmax layer in the parsing model, since
+        # the get_loss does additional masking. So, we could apply softmax
+        # manually here and use Thinc's cross-entropy loss. But it's a bit
+        # suboptimal, since we can have a lot of states that would result in
+        # many kernel launches. Futhermore the parsing model's backprop expects
+        # a XP array, so we'd have to concat the softmaxes anyway. So, like
+        # the get_loss implementation, we'll compute the loss and gradients
+        # ourselves.
+
+        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
+            axis=-1, inplace=True)
+        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
+            axis=-1, inplace=True)
+
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = student_scores - teacher_scores
+        if normalize:
+            d_scores /= d_scores.shape[0]
+        loss = (d_scores**2).sum() / d_scores.size
+
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -354,9 +351,6 @@ cdef class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
-        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
-            deals with a failing batch of documents. The default function just reraises
-            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -377,78 +371,29 @@ cdef class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
+        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        if self.cfg["beam_width"] == 1:
-            return self.greedy_parse(docs, drop=0.0)
-        else:
-            return self.beam_parse(
-                docs,
-                drop=0.0,
-                beam_width=self.cfg["beam_width"],
-                beam_density=self.cfg["beam_density"]
-            )
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states_or_beams, _ = self.model.predict(inputs)
+        return states_or_beams
 
     def greedy_parse(self, docs, drop=0.):
-        cdef vector[StateC*] states
-        cdef StateClass state
-        cdef CBlas cblas = self._cpu_ops.cblas()
+        self._resize()
         self._ensure_labels_are_added(docs)
-        set_dropout_rate(self.model, drop)
-        batch = self.moves.init_batch(docs)
-        model = self.model.predict(docs)
-        weights = get_c_weights(model)
-        for state in batch:
-            if not state.is_final():
-                states.push_back(state.c)
-        sizes = get_c_sizes(model, states.size())
-        with nogil:
-            self._parseC(cblas, &states[0], weights, sizes)
-        model.clear_memory()
-        del model
-        return batch
+        with _change_attrs(self.model, beam_width=1):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states, _ = self.model.predict(inputs)
+        return states
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
         self._ensure_labels_are_added(docs)
-        batch = _beam_utils.BeamBatch(
-            self.moves,
-            self.moves.init_batch(docs),
-            None,
-            beam_width,
-            density=beam_density
-        )
-        model = self.model.predict(docs)
-        while not batch.is_done:
-            states = batch.get_unfinished_states()
-            if not states:
-                break
-            scores = model.predict(states)
-            batch.advance(scores)
-        model.clear_memory()
-        del model
-        return list(batch)
-
-    cdef void _parseC(
-        self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
-    ) nogil:
-        cdef int i
-        cdef vector[StateC*] unfinished
-        cdef ActivationsC activations = alloc_activations(sizes)
-        while sizes.states >= 1:
-            predict_states(cblas, &activations, states, &weights, sizes)
-            # Validate actions, argmax, take action.
-            self.c_transition_batch(
-                states, activations.scores, sizes.classes, sizes.states
-            )
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-            sizes.states = unfinished.size()
-            unfinished.clear()
-        free_activations(&activations)
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            beams, _ = self.model.predict(inputs)
+        return beams
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -459,40 +404,6 @@ cdef class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
-    def transition_states(self, states, float[:, ::1] scores):
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    cdef void c_transition_batch(
-        self,
-        StateC** states,
-        const float* scores,
-        int nr_class,
-        int batch_size
-    ) nogil:
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        with gil:
-            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
-        cdef int i, guess
-        cdef Transition action
-        for i in range(batch_size):
-            self.moves.set_valid(is_valid, states[i])
-            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-            if guess == -1:
-                # This shouldn't happen, but it's hard to raise an error here,
-                # and we don't want to infinite loop. So, force to end state.
-                states[i].force_final()
-            else:
-                action = self.moves.c[guess]
-                action.do(states[i], action.label)
-        free(is_valid)
-
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -503,66 +414,99 @@ cdef class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
+        # We need to take care to act on the whole batch, because we might be
+        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        # The probability we use beam update, instead of falling back to
-        # a greedy update
-        beam_update_prob = self.cfg["beam_update_prob"]
-        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
-            return self.update_beam(
-                examples,
-                beam_width=self.cfg["beam_width"],
-                sgd=sgd,
-                losses=losses,
-                beam_density=self.cfg["beam_density"]
-            )
+        docs = [eg.x for eg in examples if len(eg.x)]
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            init_states, gold_states, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-
-        states_golds = list(zip(states, golds))
-        n_moves = 0
-        while states_golds:
-            states, golds = zip(*states_golds)
-            scores, backprop = model.begin_update(states)
-            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            # Note that the gradient isn't normalized by the batch size
-            # here, because our "samples" are really the states...But we
-            # can't normalize by the number of states either, as then we'd
-            # be getting smaller gradients for states in long sequences.
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        backprop_tok2vec(golds)
+        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
+            max_moves=max_moves, states=[state.copy() for state in init_states])
+        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
+        if sum(s.shape[0] for s in scores) == 0:
+            return losses
+        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
+            examples, max_moves)
+        backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
+        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        del model
+        del backprop_scores
         return losses
 
+    def get_loss(self, states_scores, examples, max_moves):
+        gold_states, init_states, pred_states, scores = states_scores
+        scores = self.model.ops.xp.vstack(scores)
+        costs = self._get_costs_from_histories(
+            examples,
+            gold_states,
+            init_states,
+            [list(state.history) for state in pred_states],
+            max_moves
+        )
+        xp = get_array_module(scores)
+        best_costs = costs.min(axis=1, keepdims=True)
+        gscores = scores.copy()
+        min_score = scores.min() - 1000
+        assert costs.shape == scores.shape, (costs.shape, scores.shape)
+        gscores[costs > best_costs] = min_score
+        max_ = scores.max(axis=1, keepdims=True)
+        gmax = gscores.max(axis=1, keepdims=True)
+        exp_scores = xp.exp(scores - max_)
+        exp_gscores = xp.exp(gscores - gmax)
+        Z = exp_scores.sum(axis=1, keepdims=True)
+        gZ = exp_gscores.sum(axis=1, keepdims=True)
+        d_scores = exp_scores / Z
+        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
+        return d_scores
+
+    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
+        cdef TransitionSystem moves = self.moves
+        cdef StateClass state
+        cdef int clas
+        cdef int nF = self.model.get_dim("nF")
+        cdef int nO = moves.n_moves
+        cdef int nS = sum([len(history) for history in histories])
+        cdef Pool mem = Pool()
+        cdef np.ndarray costs_i
+        is_valid = <int*>mem.alloc(nO, sizeof(int))
+        batch = list(zip(init_states, histories, gold_states))
+        n_moves = 0
+        output = []
+        while batch:
+            costs = numpy.zeros((len(batch), nO), dtype="f")
+            for i, (state, history, gold) in enumerate(batch):
+                costs_i = costs[i]
+                clas = history.pop(0)
+                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
+                action = moves.c[clas]
+                action.do(state.c, action.label)
+                state.c.history.push_back(clas)
+            output.append(costs)
+            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
+            if n_moves >= max_moves >= 1:
+                break
+            n_moves += 1
+
+        return self.model.ops.xp.vstack(output)
+
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -572,10 +516,9 @@ cdef class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.)
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
-        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -583,95 +526,33 @@ cdef class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, backprop_tok2vec = self.model.begin_update(docs)
-        n_scores = 0.
-        loss = 0.
-        while states:
-            targets, _ = tutor.begin_update(states)
-            guesses, backprop = model.begin_update(states)
-            d_scores = (guesses - targets) / targets.shape[0]
-            # If all weights for an output are 0 in the original model, don't
-            # supervise that output. This allows us to add classes.
-            loss += (d_scores**2).sum()
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, guesses)
-            states = [state for state in states if not state.is_final()]
-            n_scores += d_scores.size
-        # Do the backprop
-        backprop_tok2vec(docs)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss / n_scores
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        tutor.clear_memory()
-        del model
-        del tutor
-        return losses
+        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
+        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
+
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
+
+        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
+        student_scores = self.model.ops.xp.vstack(student_scores)
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
+        # If all weights for an output are 0 in the original model, don't
+        # supervise that output. This allows us to add classes.
+        loss = (d_scores**2).sum() / d_scores.size
+        backprop_scores((student_states, d_scores))
 
-    def update_beam(
-        self,
-        examples,
-        *,
-        beam_width,
-        drop=0.,
-        sgd=None,
-        losses=None,
-        beam_density=0.0
-    ):
-        states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        # Prepare the stepwise model, and get the callback for finishing the batch
-        model, backprop_tok2vec = self.model.begin_update(
-            [eg.predicted for eg in examples])
-        loss = _beam_utils.update_beam(
-            self.moves,
-            states,
-            golds,
-            model,
-            beam_width,
-            beam_density=beam_density,
-        )
-        losses[self.name] += loss
-        backprop_tok2vec(golds)
         if sgd is not None:
             self.finish_update(sgd)
+        losses[self.name] += loss
 
-    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
-        cdef StateClass state
-        cdef Pool mem = Pool()
-        cdef int i
-
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        return losses
 
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros(
-            (len(states), self.moves.n_moves), dtype='f', order='C'
-        )
-        c_d_scores = <float*>d_scores.data
-        unseen_classes = self.model.attrs["unseen_classes"]
-        for i, (state, gold) in enumerate(zip(states, golds)):
-            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
-            memset(costs, 0, self.moves.n_moves * sizeof(float))
-            self.moves.set_costs(is_valid, costs, state.c, gold)
-            for j in range(self.moves.n_moves):
-                if costs[j] <= 0.0 and j in unseen_classes:
-                    unseen_classes.remove(j)
-            cpu_log_loss(
-                c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
-            )
-            c_d_scores += d_scores.shape[1]
-        # Note that we don't normalize this. See comment in update() for why.
-        if losses is not None:
-            losses.setdefault(self.name, 0.)
-            losses[self.name] += (d_scores**2).sum()
-        return d_scores
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        raise NotImplementedError
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -710,7 +591,7 @@ cdef class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(doc_sample)
+        self.model.initialize((doc_sample, self.moves))
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -803,26 +684,27 @@ cdef class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long
-        doc will get multiple states. Let's say we have a doc of length 2*N,
-        where N is the shortest doc. We'll make two states, one representing
-        long_doc[:N], and another representing long_doc[N:]."""
+        sequence or a cap. A long doc will get multiple states. Let's say we
+        have a doc of length 2*N, where N is the shortest doc. We'll make
+        two states, one representing long_doc[:N], and another representing
+        long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-        all_states = self.moves.init_batch([eg.predicted for eg in examples])
+            TransitionSystem moves = self.moves
+        all_states = moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if self.moves.has_gold(eg) and not state.is_final():
-                gold = self.moves.init_gold(state, eg)
+            if moves.has_gold(eg) and not state.is_final():
+                gold = moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                    oracle_actions = moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -832,13 +714,52 @@ cdef class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = self.moves.c[clas]
+                    action = moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
+
+
+@contextlib.contextmanager
+def _change_attrs(model, **kwargs):
+    """Temporarily modify a thinc model's attributes."""
+    unset = object()
+    old_attrs = {}
+    for key, value in kwargs.items():
+        old_attrs[key] = model.attrs.get(key, unset)
+        model.attrs[key] = value
+    yield model
+    for key, value in old_attrs.items():
+        if value is unset:
+            model.attrs.pop(key)
+        else:
+            model.attrs[key] = value
+
+
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
+    cdef int step
+    cdef StateClass state
+    cdef StateC* c_state
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for state in states:
+            c_state = state.c
+            if step < c_state.history.size():
+                step_actions.append(c_state.history[step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 54ee053981f..b2c39ae88bc 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -16,6 +16,8 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
+import logging
 
 from ..util import make_tempdir
 
@@ -412,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -539,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -575,7 +577,6 @@ def test_overfitting_IO(use_upper):
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
-        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a943c3538e0..a6e1852514d 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,16 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.pipeline import DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
@@ -59,6 +60,8 @@
     ),
 ]
 
+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1
 
 
@@ -171,6 +174,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -319,7 +373,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -345,11 +399,15 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
+    fix_random_seed(0)
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -451,10 +509,12 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 998f0472c7e..9648341a106 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v2"
+    @architectures = "spacy.TransitionBasedParser.v3"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b36d3ad7473..dd0a53c910e 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -130,33 +130,11 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -187,7 +165,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser
 
@@ -293,15 +270,16 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        assert model.get_ref("upper").get_dim("nI") == 65
-        assert model.get_ref("lower").get_dim("nI") == 65
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -314,11 +292,13 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        if model.attrs["has_upper"]:
-            assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
@@ -514,9 +494,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index d2a41ff0fed..160e2ecf335 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,24 +1,13 @@
 import ctypes
 import os
 from pathlib import Path
-
-import pytest
-
-try:
-    from pydantic.v1 import ValidationError
-except ImportError:
-    from pydantic import ValidationError  # type: ignore
-
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -101,34 +90,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 914e877f579..b2c93f24bfa 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,5 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
-
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..db8f974ea19 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,18 +833,17 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -874,23 +873,22 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                       |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
-</Accordion>
+ </Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 47028f4a2e7..acc2ce1caa2 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -408,7 +408,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -418,7 +418,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -743,7 +743,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -766,7 +766,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index b44df538766..44c80622437 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..1b0bc9606e9 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 2920a05e08b7b696ad71e12bd37f6b6cad6a2b18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 18:28:30 +0100
Subject: [PATCH 038/504] Fix batching regression (#12094)

* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handed.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
---
 pyproject.toml                        |  2 +-
 requirements.txt                      |  2 +-
 setup.cfg                             |  2 +-
 spacy/tests/pipeline/test_tagger.py   |  4 +-
 spacy/tests/pipeline/test_textcat.py  |  8 +++-
 spacy/tests/training/test_training.py |  4 +-
 spacy/training/batchers.py            | 58 ++++++++++++++-------------
 spacy/util.py                         |  8 ++--
 website/docs/api/top-level.mdx        | 30 +++++++-------
 9 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 58e4382e829..3891d137867 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 8224ab783f9..6824be1cee3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 6c3edd83e73..9a9b530071b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,7 +38,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev1,<9.1.0
+    thinc>=9.0.0.dev2,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5da5c209975..b6f94f7f97b 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -24,7 +24,9 @@ def test_issue4348():
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        batches = util.minibatch(
+            TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 214c1bfbed1..2383c36bb01 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -100,7 +100,9 @@ def test_issue3611():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = util.minibatch(
+                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+            )
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -137,7 +139,9 @@ def test_issue4030():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = util.minibatch(
+                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+            )
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 68f7e8a0d57..ef20ec365c6 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -924,7 +924,9 @@ def _train_tuples(train_data):
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
+        batches = minibatch(
+            train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 519e61315da..469bb263016 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,9 +1,10 @@
 import itertools
-from thinc.schedules import Schedule, constant as constant_schedule
+from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int, Schedule[int]]
+SizingSchedule = Union[Iterable[int], int, Schedule]
+Sizing = Union[Iterable[int], int]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -11,7 +12,7 @@
 @registry.batchers("spacy.batch_by_padded.v1")
 def configure_minibatch_by_padded_size(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     buffer: int,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
@@ -21,8 +22,8 @@ def configure_minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
-    size (int or Sequence[int]): The largest padded size to batch sequences into.
-        Can be a single integer, or a sequence, allowing for variable batch sizes.
+    size (int, Iterable[int] or Schedule): The largest padded size to batch sequences
+        into. Can be a single integer, or a sequence, allowing for variable batch sizes.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result
@@ -36,7 +37,7 @@ def configure_minibatch_by_padded_size(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_padded_size,
-        size=size,
+        size=_schedule_to_sizing(size),
         buffer=buffer,
         discard_oversize=discard_oversize,
         **optionals
@@ -46,14 +47,14 @@ def configure_minibatch_by_padded_size(
 @registry.batchers("spacy.batch_by_words.v1")
 def configure_minibatch_by_words(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     tolerance: float,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the "minibatch by words" strategy.
 
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int, Iterable[int] or Schedule): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves
@@ -64,7 +65,7 @@ def configure_minibatch_by_words(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_words,
-        size=size,
+        size=_schedule_to_sizing(size),
         tolerance=tolerance,
         discard_oversize=discard_oversize,
         **optionals
@@ -73,15 +74,15 @@ def configure_minibatch_by_words(
 
 @registry.batchers("spacy.batch_by_sequence.v1")
 def configure_minibatch(
-    size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
+    size: SizingSchedule, get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that creates batches of the specified size.
 
-    size (int or Sequence[int]): The target number of items per batch.
+    size (int, Iterable[int] or Schedule): The target number of items per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     """
     optionals = {"get_length": get_length} if get_length is not None else {}
-    return partial(minibatch, size=size, **optionals)
+    return partial(minibatch, size=_schedule_to_sizing(size), **optionals)
 
 
 def minibatch_by_padded_size(
@@ -97,7 +98,7 @@ def minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
-    size (int or Sequence[int]): The largest padded size to batch sequences into.
+    size (int or Iterable[int]): The largest padded size to batch sequences into.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result
@@ -108,13 +109,12 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
+        size_ = iter(size)
+    for outer_batch in minibatch(seqs, size=buffer):
         outer_batch = list(outer_batch)
-        target_size = size_(step)
+        target_size = next(size_)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -136,7 +136,7 @@ def minibatch_by_words(
     themselves, or be discarded if discard_oversize=True.
 
     seqs (Iterable[Sequence]): The sequences to minibatch.
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int or Iterable[int]): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves
@@ -145,12 +145,10 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    step = 0
-    target_size = size_(step)
+        size_ = iter(size)
+    target_size = next(size_)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -175,8 +173,7 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            step += 1
-            target_size = size_(step)
+            target_size = next(size_)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -194,8 +191,7 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                step += 1
-                target_size = size_(step)
+                target_size = next(size_)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
@@ -232,3 +228,9 @@ def _batch_by_length(
     batches = [list(sorted(batch)) for batch in batches]
     batches.reverse()
     return batches
+
+
+def _schedule_to_sizing(size: SizingSchedule) -> Sizing:
+    if isinstance(size, Schedule):
+        return size.to_generator()
+    return size
diff --git a/spacy/util.py b/spacy/util.py
index 551f78cc969..dedcd17ea58 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1642,12 +1642,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
-        size_ = size
+        size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 77216924405..8555d64ba63 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -878,14 +878,14 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 > get_length = null
 > ```
 
-| Name               | Description                                                                                                                                                                             |
-| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~                                                                                                                                           |
-| `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
-| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                       |
-| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                     |
-| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
-| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
+| Name               | Description                                                                                                                                                                                       |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~                                                                                                                                                     |
+| `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
+| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                                 |
+| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                               |
+| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                           |
+| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                               |
 
 ### spacy.batch_by_sequence.v1 {id="batch_by_sequence",tag="registered function"}
 
@@ -900,11 +900,11 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 
 Create a batcher that creates batches of the specified size.
 
-| Name         | Description                                                                                                                                                                             |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
-| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
-| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
+| Name         | Description                                                                                                                                                                                       |
+| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
+| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                           |
+| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                               |
 
 ### spacy.batch_by_padded.v1 {id="batch_by_padded",tag="registered function"}
 
@@ -926,7 +926,7 @@ sequences in the batch.
 
 | Name               | Description                                                                                                                                                                                                                                 |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~                                          |
+| `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~                                |
 | `buffer`           | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~                                                                                                                                     |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                                                                     |
@@ -1528,7 +1528,7 @@ vary on each step.
 | Name       | Description                                      |
 | ---------- | ------------------------------------------------ |
 | `items`    | The items to batch up. ~~Iterable[Any]~~         |
-| `size`     | The batch size(s). ~~Union[int, Sequence[int]]~~ |
+| `size`     | The batch size(s). ~~Union[int, Iterable[int]]~~ |
 | **YIELDS** | The batches.                                     |
 
 ### util.filter_spans {id="util.filter_spans",tag="function",version="2.1.4"}

From 07f5ae7b29847899d51cec20a79670fbb0eded52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Jan 2023 09:25:34 +0100
Subject: [PATCH 039/504] Set version to v4.0.0.dev0 (#12126)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index f5ee66dae6f..1ce8a44c9a4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "4.0.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From a91494fb386ec17fa1deff4014cd96cea634c923 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 25 Jan 2023 12:50:21 +0900
Subject: [PATCH 040/504] Refactor lexeme mem passing (#12125)

* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods calling _new_lexeme, get and get_by_orth, took mem arguments
just to call the internal method. That's no longer necessary, so this
cleans it up.

* prettier formatting

* Remove more unused mem args
---
 spacy/lexeme.pyx                    |  2 +-
 spacy/tokenizer.pxd                 | 76 ++++++++---------------------
 spacy/tokenizer.pyx                 | 39 +++++++--------
 spacy/tokens/doc.pyx                |  8 +--
 spacy/tokens/retokenizer.pyx        |  4 +-
 spacy/vocab.pxd                     |  7 ++-
 spacy/vocab.pyx                     | 30 ++++--------
 website/docs/api/cython-classes.mdx | 20 ++++----
 8 files changed, 67 insertions(+), 119 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 3e63afa34ba..41fc8f1d2b1 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -57,7 +57,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index c963dcbcfa4..58d30c3202f 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -31,58 +31,24 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
-    cdef void _filter_special_spans(
-        self,
-        vector[SpanC] &original,
-        vector[SpanC] &filtered,
-        int doc_len,
-    ) nogil
-    cdef object _prepare_special_spans(
-        self,
-        Doc doc,
-        vector[SpanC] &filtered,
-    )
-    cdef int _retokenize_special_spans(
-        self,
-        Doc doc,
-        TokenC* tokens,
-        object span_data,
-    )
-    cdef int _try_specials_and_cache(
-        self,
-        hash_t key,
-        Doc tokens,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _tokenize(
-        self,
-        Doc tokens,
-        str span,
-        hash_t key,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    )
-    cdef int _attach_tokens(
-        self,
-        Doc tokens,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _save_cached(
-        self,
-        const TokenC* tokens,
-        hash_t key,
-        int* has_special,
-        int n,
-    ) except -1
+    cdef void _filter_special_spans(self, vector[SpanC] &original,
+                            vector[SpanC] &filtered, int doc_len) nogil
+    cdef object _prepare_special_spans(self, Doc doc,
+                                       vector[SpanC] &filtered)
+    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
+                                       object span_data)
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
+                       int* has_special, bint with_special_cases) except -1
+    cdef str _split_affixes(self, str string,
+                                vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes, int* has_special,
+                                bint with_special_cases)
+    cdef int _attach_tokens(self, Doc tokens, str string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes, int* has_special,
+                            bint with_special_cases) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int* has_special, int n) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index cdb7dda7094..12a78d39fc4 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -393,22 +393,19 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+        span = self._split_affixes(span, &prefixes, &suffixes,
                                    has_special, with_special_cases)
         self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                             with_special_cases)
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[const LexemeC*] *prefixes,
-        vector[const LexemeC*] *suffixes,
-        int* has_special,
-        bint with_special_cases
-    ):
+    cdef str _split_affixes(self, str string,
+                                vector[const LexemeC*] *prefixes,
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special,
+                                bint with_special_cases):
+        cdef size_t i
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -426,7 +423,7 @@ cdef class Tokenizer:
                 minus_pre = string[pre_len:]
                 if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    prefixes.push_back(self.vocab.get(prefix))
                     break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -434,18 +431,18 @@ cdef class Tokenizer:
                 minus_suf = string[:-suf_len]
                 if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    suffixes.push_back(self.vocab.get(suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(mem, prefix))
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                prefixes.push_back(self.vocab.get(prefix))
+                suffixes.push_back(self.vocab.get(suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(mem, prefix))
+                prefixes.push_back(self.vocab.get(prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                suffixes.push_back(self.vocab.get(suffix))
         return string
 
     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -470,11 +467,11 @@ cdef class Tokenizer:
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
-                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                    tokens.push_back(self.vocab.get(string), False)
                 else:
                     # Let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
@@ -489,7 +486,7 @@ cdef class Tokenizer:
 
                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            tokens.push_back(self.vocab.get(span), False)
 
                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -497,11 +494,11 @@ cdef class Tokenizer:
                             # for tokenization in some languages (see
                             # https://github.com/explosion/spaCy/issues/768)
                             infix_span = string[infix_start:infix_end]
-                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                            tokens.push_back(self.vocab.get(infix_span), False)
                         start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 09dc94297f0..56ee216d17f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -302,12 +302,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1475,7 +1475,7 @@ cdef class Doc:
             end = start + attrs[i, 0]
             has_space = attrs[i, 1]
             orth_ = text[start:end]
-            lex = self.vocab.get(self.mem, orth_)
+            lex = self.vocab.get(orth_)
             self.push_back(lex, has_space)
             start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1580,7 +1580,7 @@ cdef class Doc:
         assert words == reconstructed_words
 
         for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
             self.push_back(lex, has_space)
 
         # Set remaining token-level attributes via Doc.from_array().
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index d3e9c5674cc..c0052ca9a9a 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -220,7 +220,7 @@ def _merge(Doc doc, merges):
             if doc.vocab.vectors_length > 0:
                 doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -360,7 +360,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index b91ce3ab45b..f9e01b186b3 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 834f21c35dc..8ac1215dead 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -145,7 +145,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,9 +162,9 @@ cdef class Vocab:
                                                   orth=key, orth_id=string))
             return lex
         else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)
 
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -176,21 +176,10 @@ cdef class Vocab:
         if lex != NULL:
             return lex
         else:
-            return self._new_lexeme(mem, self.strings[orth])
-
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
-        mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #    mem = self.mem
-        cdef bint is_oov = mem is not self.mem
-        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+            return self._new_lexeme(self.strings[orth])
+
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
         if self.vectors is not None and hasattr(self.vectors, "key2row"):
@@ -204,8 +193,7 @@ cdef class Vocab:
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
@@ -276,7 +264,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
+            lex = <LexemeC*>self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
diff --git a/website/docs/api/cython-classes.mdx b/website/docs/api/cython-classes.mdx
index ce7c03940ac..88bd92c723b 100644
--- a/website/docs/api/cython-classes.mdx
+++ b/website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `string`    | The string of the word to look up. ~~str~~                                                                 |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                       |
+| ----------- | ------------------------------------------------- |
+| `string`    | The string of the word to look up. ~~str~~        |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
 
 ### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
 
@@ -183,11 +182,10 @@ vocabulary.
 > lexeme = vocab.get_by_orth(doc[0].lex.norm)
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~                                                     |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
+| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~      |
 
 ## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
 

From f6be97887459813e979f12cffa6b1ccdc91e68bf Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:29:46 +0100
Subject: [PATCH 041/504] Format

---
 spacy/pipeline/edit_tree_lemmatizer.py |  2 +-
 spacy/pipeline/entity_linker.py        | 12 ++++++++++--
 spacy/pipeline/ner.py                  |  7 +++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index d5169178b8c..a1bcb98455c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -167,7 +167,7 @@ def get_teacher_student_loss(
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
         loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index bab79282d5b..9c4312f6dd8 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -463,7 +463,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         docs_ents: List[Ragged] = []
         docs_scores: List[Ragged] = []
         if not docs:
-            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+            return {
+                KNOWLEDGE_BASE_IDS: final_kb_ids,
+                "ents": docs_ents,
+                "scores": docs_scores,
+            }
         if isinstance(docs, Doc):
             docs = [docs]
         for doc in docs:
@@ -565,7 +569,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+        return {
+            KNOWLEDGE_BASE_IDS: final_kb_ids,
+            "ents": docs_ents,
+            "scores": docs_scores,
+        }
 
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 4c2a3ac093c..2c5fd89cc5d 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -260,8 +260,11 @@ def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
     def labels(self):
         # Get the labels from the model by looking at the available moves, e.g.
         # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        labels = set(remove_bilu_prefix(move) for move in self.move_names
-                     if move[0] in ("B", "I", "L", "U"))
+        labels = set(
+            remove_bilu_prefix(move)
+            for move in self.move_names
+            if move[0] in ("B", "I", "L", "U")
+        )
         return tuple(sorted(labels))
 
     def scored_ents(self, beams):

From 388c34f36b0041f2d58114b72d318a02665ab813 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:37:02 +0100
Subject: [PATCH 042/504] CI: Skip tests that require published pipelines

---
 .github/azure-steps.yml | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index c7722391fec..fc83d4994b4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -64,12 +64,17 @@ steps:
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)
 
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -m spacy download ca_core_news_sm
+#      python -m spacy download ca_core_news_md
+#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#    displayName: 'Test download CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -93,17 +98,17 @@ steps:
     displayName: 'Test train CLI'
     condition: eq(variables['python_version'], '3.8')
 
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#    displayName: 'Test assemble CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#    displayName: 'Test assemble CLI vectors warning'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json

From c90310ea4a9b1c8778637ac0c217e98bf6503a44 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:48:20 +0100
Subject: [PATCH 043/504] Drop python 3.6/3.7, remove unneeded compat (#12187)

* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
---
 .pre-commit-config.yaml         |  2 +-
 CONTRIBUTING.md                 |  2 +-
 README.md                       |  2 +-
 azure-pipelines.yml             | 20 +----------
 requirements.txt                |  5 ++-
 setup.cfg                       |  6 ++--
 spacy/cli/_util.py              | 10 ++++++
 spacy/cli/debug_data.py         |  8 +++++
 spacy/compat.py                 | 13 -------
 spacy/errors.py                 |  3 +-
 spacy/language.py               | 61 +++++++++++++++------------------
 spacy/matcher/matcher.pyi       | 17 ++-------
 spacy/matcher/phrasematcher.pyi |  7 ++--
 spacy/ml/models/parser.py       |  5 +--
 spacy/pipeline/spancat.py       |  9 +++--
 spacy/schemas.py                |  9 +++++
 spacy/ty.py                     | 16 ++-------
 spacy/util.py                   | 11 +++---
 website/docs/usage/index.mdx    |  2 +-
 19 files changed, 87 insertions(+), 121 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e2c5e98fd97..8efe733f904 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 22.3.0
     hooks:
     - id: black
-      language_version: python3.7
+      language_version: python3.8
       additional_dependencies: ['click==8.0.4']
 -   repo: https://github.com/pycqa/flake8
     rev: 5.0.4
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ed75e1fd8bd..b85ea8fcc4d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -276,7 +276,7 @@ except:  # noqa: E722
 
 ### Python conventions
 
-All Python code must be written **compatible with Python 3.6+**. More detailed
+All Python code must be written **compatible with Python 3.8+**. More detailed
 code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
 
 #### I/O and handling paths
diff --git a/README.md b/README.md
index afa96363b65..9e5c4be6898 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.7+ (only 64 bit)
+- **Python version**: Python 3.8+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 0f7ea91f96f..99f1b8afffe 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: "3.7"
+          versionSpec: "3.8"
       - script: |
           pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
@@ -40,24 +40,6 @@ jobs:
     strategy:
       matrix:
         # We're only running one platform per Python version to speed up builds
-        Python36Linux:
-          imageName: "ubuntu-20.04"
-          python.version: "3.6"
-        #        Python36Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.6"
-        #        Python36Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.6"
-        #        Python37Linux:
-        #          imageName: "ubuntu-20.04"
-        #          python.version: "3.7"
-        Python37Windows:
-          imageName: "windows-latest"
-          python.version: "3.7"
-        #        Python37Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.7"
         #        Python38Linux:
         #          imageName: "ubuntu-latest"
         #          python.version: "3.8"
diff --git a/requirements.txt b/requirements.txt
index 6824be1cee3..6ad10b1d1c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.11,<3.1.0
+spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
@@ -23,7 +23,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -32,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=0.990,<0.1000; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/setup.cfg b/setup.cfg
index 9a9b530071b..39ca2dfa743 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,7 +17,6 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
@@ -30,10 +29,10 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.11,<3.1.0
+    spacy-legacy>=4.0.0.dev0,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
@@ -55,7 +54,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index fa41e6a08e0..ea91e64247d 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,3 +1,10 @@
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
+from typing import TYPE_CHECKING, overload
+import sys
+import shutil
+from pathlib import Path
+from wasabi import msg, Printer
+import srsly
 import hashlib
 import os
 import shutil
@@ -27,6 +34,9 @@
 from wasabi import Printer, msg
 from weasel import app as project_cli
 
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 from ..compat import Literal
 from ..schemas import validate
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index af3c24f3ba9..c2253b0cb70 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,3 +1,11 @@
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import Literal, cast, overload
+from pathlib import Path
+from collections import Counter
+import sys
+import srsly
+from wasabi import Printer, MESSAGES, msg
+import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/compat.py b/spacy/compat.py
index 522fa30ddde..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,19 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
-# Important note: The importlib_metadata "backport" includes functionality
-# that's not part of the built-in importlib.metadata. We should treat this
-# import like the built-in and only use what's available there.
-try:  # Python 3.8+
-    import importlib.metadata as importlib_metadata
-except ImportError:
-    from catalogue import _importlib_metadata as importlib_metadata  # type: ignore[no-redef]    # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 9074a3fead8..dcf8e60b7a1 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,7 +1,6 @@
+from typing import Literal
 import warnings
 
-from .compat import Literal
-
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
diff --git a/spacy/language.py b/spacy/language.py
index a47cc5df454..161d5b64884 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,3 +1,10 @@
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
+from typing import Union, Tuple, List, Set, Pattern, Sequence
+from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
+
+from dataclasses import dataclass
+import random
+import itertools
 import functools
 import inspect
 import itertools
@@ -30,43 +37,29 @@
     overload,
 )
 
-import srsly
-from thinc.api import Config, CupyOps, Optimizer, get_current_ops
-
-from . import about, ty, util
-from .compat import Literal
+from . import ty
+from .tokens.underscore import Underscore
+from .vocab import Vocab, create_vocab
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
+from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
+from .scorer import Scorer
+from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
+from .util import warn_if_jupyter_cupy
+from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
+from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.punctuation import TOKENIZER_INFIXES
+from .tokens import Doc
+from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from . import util
+from . import about
 from .lookups import load_lookups
-from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
-from .schemas import (
-    ConfigSchema,
-    ConfigSchemaInit,
-    ConfigSchemaNlp,
-    ConfigSchemaPretrain,
-    validate_init_settings,
-)
-from .scorer import Scorer
-from .tokenizer import Tokenizer
-from .tokens import Doc
-from .tokens.underscore import Underscore
-from .training import Example, validate_examples
-from .training.initialize import init_tok2vec, init_vocab
-from .util import (
-    _DEFAULT_EMPTY_PIPES,
-    CONFIG_SECTION_ORDER,
-    SimpleFrozenDict,
-    SimpleFrozenList,
-    _pipe,
-    combine_score_weights,
-    raise_error,
-    registry,
-    warn_if_jupyter_cupy,
-)
-from .vectors import BaseVectors
-from .vocab import Vocab, create_vocab
+
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index c33b534cbd2..a0b6d91e7d5 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,17 +1,6 @@
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Tuple,
-    Union,
-    overload,
-)
-
-from ..compat import Literal
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
+from typing import Iterator, Iterable, overload
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index f9585da7893..45685db228a 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,6 +1,7 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
-
-from ..compat import Literal
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
+from typing import overload
+from .matcher import Matcher
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 59483839206..01312983d86 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,9 @@
-from typing import Optional, List, Tuple, Any
+from typing import Optional, List, Tuple, Any, Literal
 from thinc.types import Floats2d
 from thinc.api import Model
 import warnings
 
 from ...errors import Errors, Warnings
-from ...compat import Literal
-from ...errors import Errors
-from ...tokens import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
 from ...tokens.doc import Doc
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 1450bb5d6cb..bfaaf82e8d0 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,5 +1,5 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union
+from typing import Union, Protocol, runtime_checkable
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
@@ -8,7 +8,12 @@
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
 from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
 
-from ..compat import Protocol, runtime_checkable
+from ..scorer import Scorer
+from ..language import Language
+from .trainable_pipe import TrainablePipe
+from ..tokens import Doc, SpanGroup, Span
+from ..vocab import Vocab
+from ..training import Example, validate_examples
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 9a2b5ed60e9..831f7df058f 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,3 +1,12 @@
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
+from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError, Model
+from thinc.config import Promise
+from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/ty.py b/spacy/ty.py
index f389456c03e..5a2b44aa583 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,17 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Sequence,
-)
-
-from thinc.api import Model, Optimizer
-
-from .compat import Protocol, runtime_checkable
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index dedcd17ea58..de04ee6e718 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,5 +1,6 @@
 import functools
 import importlib
+import importlib.metadata
 import importlib.util
 import re
 from pathlib import Path
@@ -70,7 +71,7 @@
 
 
 from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .compat import cupy, CudaStream, is_windows
 from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
@@ -748,8 +749,8 @@ def get_package_version(name: str) -> Optional[str]:
     RETURNS (str / None): The version or None if package not installed.
     """
     try:
-        return importlib_metadata.version(name)  # type: ignore[attr-defined]
-    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return importlib.metadata.version(name)  # type: ignore[attr-defined]
+    except importlib.metadata.PackageNotFoundError:  # type: ignore[attr-defined]
         return None
 
 
@@ -937,7 +938,7 @@ def is_package(name: str) -> bool:
     RETURNS (bool): True if installed package, False if not.
     """
     try:
-        importlib_metadata.distribution(name)  # type: ignore[attr-defined]
+        importlib.metadata.distribution(name)  # type: ignore[attr-defined]
         return True
     except:  # noqa: E722
         return False
@@ -1777,7 +1778,7 @@ def packages_distributions() -> Dict[str, List[str]]:
     it's not available in the builtin importlib.metadata.
     """
     pkg_to_dist = defaultdict(list)
-    for dist in importlib_metadata.distributions():
+    for dist in importlib.metadata.distributions():
         for pkg in (dist.read_text("top_level.txt") or "").split():
             pkg_to_dist[pkg].append(dist.metadata["Name"])
     return dict(pkg_to_dist)
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index c50e9db6c6b..b8b4917f2b2 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -20,7 +20,7 @@ menu:
 
 ## Installation instructions {id="installation"}
 
-spaCy is compatible with **64-bit CPython 3.7+** and runs on **Unix/Linux**,
+spaCy is compatible with **64-bit CPython 3.8+** and runs on **Unix/Linux**,
 **macOS/OS X** and **Windows**. The latest spaCy releases are available over
 [pip](https://pypi.python.org/pypi/spacy) and
 [conda](https://anaconda.org/conda-forge/spacy).

From d783c45fa065a8607efc3e850e2c9f3b3dc962ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Jan 2023 12:44:11 +0100
Subject: [PATCH 044/504] Add `Language.distill` (#12116)

* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify that how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                       | 108 +++++++++++++++++++++++-
 spacy/pipeline/trainable_pipe.pyx       |   4 +-
 spacy/pipeline/transition_parser.pyx    |   4 +-
 spacy/tests/test_language.py            |  69 +++++++++++++++
 spacy/ty.py                             |  19 +++++
 website/docs/api/dependencyparser.mdx   |  18 ++--
 website/docs/api/edittreelemmatizer.mdx |  18 ++--
 website/docs/api/entityrecognizer.mdx   |  18 ++--
 website/docs/api/language.mdx           |  28 ++++++
 website/docs/api/morphologizer.mdx      |  18 ++--
 website/docs/api/pipe.mdx               |  18 ++--
 website/docs/api/sentencerecognizer.mdx |  18 ++--
 website/docs/api/tagger.mdx             |  18 ++--
 13 files changed, 290 insertions(+), 68 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 161d5b64884..8cd439d10b1 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -41,7 +41,7 @@
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
+from .training import Example, validate_examples, validate_distillation_examples
 from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
@@ -1049,6 +1049,102 @@ def __call__(
                 raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
         return doc
 
+    def distill(
+        self,
+        teacher: "Language",
+        examples: Iterable[Example],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
+        student_to_teacher: Optional[Dict[str, str]] = None,
+    ):
+        """Distill the models in a student pipeline from a teacher pipeline.
+        teacher (Language): Teacher to distill from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
+        drop (float): The dropout rate.
+        sgd (Optional[Optimizer]): An optimizer.
+        losses (Optional(Dict[str, float])): Dictionary to update with the loss,
+            keyed by component.
+        component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
+            for specific pipeline components, keyed by component name.
+        exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
+        student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
+            teacher pipe name, only needed for pipes where the student pipe
+            name does not match the teacher pipe name.
+        RETURNS (Dict[str, float]): The updated losses dictionary
+
+        DOCS: https://spacy.io/api/language#distill
+        """
+        if student_to_teacher is None:
+            student_to_teacher = {}
+        if losses is None:
+            losses = {}
+        if isinstance(examples, list) and len(examples) == 0:
+            return losses
+
+        validate_distillation_examples(examples, "Language.distill")
+        examples = _copy_examples(examples)
+
+        if sgd is None:
+            if self._optimizer is None:
+                self._optimizer = self.create_optimizer()
+            sgd = self._optimizer
+
+        if component_cfg is None:
+            component_cfg = {}
+        pipe_kwargs = {}
+        for student_name, student_proc in self.pipeline:
+            component_cfg.setdefault(student_name, {})
+            pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
+            component_cfg[student_name].setdefault("drop", drop)
+            pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
+
+        teacher_pipes = dict(teacher.pipeline)
+        for student_name, student_proc in self.pipeline:
+            if student_name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=student_proc,
+                        name=student_name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[student_name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
+
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+            ):
+                # A missing teacher pipe is not an error, some student pipes
+                # do not need a teacher, such as tok2vec layer losses.
+                teacher_name = (
+                    student_to_teacher[student_name]
+                    if student_name in student_to_teacher
+                    else student_name
+                )
+                teacher_pipe = teacher_pipes.get(teacher_name, None)
+                student_proc.distill(
+                    teacher_pipe,
+                    examples,
+                    sgd=sgd,
+                    losses=losses,
+                    **component_cfg[student_name],
+                )
+
+        return losses
+
     def disable_pipes(self, *names) -> "DisabledPipes":
         """Disable one or more pipeline components. If used as a context
         manager, the pipeline will be restored to the initial state at the end
@@ -1274,12 +1370,16 @@ def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
+        labels: Optional[Dict[str, Any]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
+        labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
+            using the names of the pipes as keys. Overrides labels that are in
+            the model configuration.
         sgd (Optional[Optimizer]): An optimizer to use for updates. If not
             provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
@@ -1327,6 +1427,8 @@ def get_examples():
         for name, proc in self.pipeline:
             if isinstance(proc, ty.InitializableComponent):
                 p_settings = I["components"].get(name, {})
+                if labels is not None and name in labels:
+                    p_settings["labels"] = labels[name]
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
@@ -1800,6 +1902,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
+        orig_distill = config.pop("distill", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1808,6 +1911,9 @@ def from_config(
             filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
+        if orig_distill is not None:
+            filled["distill"] = orig_distill
+            config["distill"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 3ec3e7551aa..97442a1aa97 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -74,8 +74,8 @@ cdef class TrainablePipe(Pipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d71a4ab0355..6a50dbacaeb 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -233,8 +233,8 @@ class Parser(TrainablePipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index d229739e1ee..8138cb157d2 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -28,6 +28,12 @@
     pass
 
 
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
@@ -805,3 +811,66 @@ def bad_pipe(doc):
     nlp.add_pipe("test_component_bad_pipe")
     with pytest.raises(ValueError, match="instead of a Doc"):
         nlp("text")
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"])
+def test_distill(teacher_tagger_name):
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name)
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses[teacher_tagger_name] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TAGGER_TRAIN_DATA
+    ]
+
+    student_to_teacher = (
+        None
+        if teacher_tagger.name == student_tagger.name
+        else {student_tagger.name: teacher_tagger.name}
+    )
+
+    for i in range(50):
+        losses = {}
+        student.distill(
+            teacher,
+            distill_examples,
+            sgd=optimizer,
+            losses=losses,
+            student_to_teacher=student_to_teacher,
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Do an extra update to check if annotates works, though we can't really
+    # validate the resuls, since the annotations are ephemeral.
+    student.distill(
+        teacher,
+        distill_examples,
+        sgd=optimizer,
+        losses=losses,
+        student_to_teacher=student_to_teacher,
+        annotates=["tagger"],
+    )
diff --git a/spacy/ty.py b/spacy/ty.py
index 5a2b44aa583..ac09cb336ac 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -25,6 +25,25 @@ def finish_update(self, sgd: Optimizer) -> None:
         ...
 
 
+@runtime_checkable
+class DistillableComponent(Protocol):
+    is_distillable: bool
+
+    def distill(
+        self,
+        teacher_pipe: Optional[TrainableComponent],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None
+    ) -> Dict[str, float]:
+        ...
+
+    def finish_update(self, sgd: Optimizer) -> None:
+        ...
+
+
 @runtime_checkable
 class InitializableComponent(Protocol):
     def initialize(
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 5179ce48b84..296d6d87da5 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -154,15 +154,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 2e099365758..c8b5c71806b 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -138,15 +138,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 005d5d11deb..f503cc998b0 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -150,15 +150,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index d5fbae05ec4..2a1f7a1a961 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -333,6 +333,34 @@ and custom registered functions if needed. See the
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
 
+## Language.distill {id="distill",tag="method,experimental",version="4"}
+
+Distill the models in a student pipeline from a teacher pipeline.
+
+> #### Example
+>
+> ```python
+>
+> teacher = spacy.load("en_core_web_lg")
+> student = English()
+> student.add_pipe("tagger")
+> student.distill(teacher, examples, sgd=optimizer)
+> ```
+
+| Name                 | Description                                                                                                                                                                                 |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_       |                                                                                                                                                                                             |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4f79458d319..4660ec312fa 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -144,15 +144,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index 120c8f6908f..e1e7f5d7021 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -257,15 +257,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 02fd57102e2..dfb7ed308ba 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -129,15 +129,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 664fd7940c1..35e7a23b174 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -128,15 +128,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Tagger.pipe {id="pipe",tag="method"}
 

From 6ac73ab21db70ed1cd7dfa301ebaffc6d757891c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 31 Jan 2023 19:31:17 +0900
Subject: [PATCH 045/504] Don't re-download installed models (#12188)

* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 .github/azure-steps.yml |  5 +++++
 spacy/cli/download.py   | 11 ++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index fc83d4994b4..11dc7e295e4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -74,6 +74,11 @@ steps:
 #  - script: |
 #      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
 #    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
+#    displayName: 'Test skip re-download (#12188)'
 #    condition: eq(variables['python_version'], '3.8')
 
   - script: |
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index f371d110319..ea7a06c5417 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,8 @@
 
 from .. import about
 from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
+from ..util import get_package_version
 
 
 @app.command(
@@ -71,6 +72,14 @@ def download(
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
 
+    # If we already have this version installed, skip downloading
+    installed = get_installed_models()
+    if model_name in installed:
+        installed_version = get_package_version(model_name)
+        if installed_version == version:
+            msg.warn(f"{model_name} v{version} already installed, skipping")
+            return
+
     filename = get_model_filename(model_name, version, sdist)
 
     download_model(filename, pip_args)

From 96e0b3c633864b4a3fbc59c1b7e433b96f5ca01f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:06:02 +0100
Subject: [PATCH 046/504] Add the configuration schema for distillation
 (#12201)

* Add the configuration schema for distillation

This also adds the default configuration and some tests. The schema will
be used by the training loop and `distill` subcommand.

* Format

* Change distillation shortopt to -d

* Fix descripion of max_epochs

* Rename distillation flag to -dt

* Rename `pipe_map` to `student_to_teacher`
---
 spacy/cli/init_config.py                      | 15 +++-
 spacy/default_config_distillation.cfg         | 34 ++++++++
 spacy/language.py                             |  3 +
 spacy/schemas.py                              | 23 +++++
 .../tests/serialize/test_serialize_config.py  | 85 +++++++++++++++----
 5 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 spacy/default_config_distillation.cfg

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index a7c03d00f90..129b5a24e84 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -9,7 +9,7 @@
 from wasabi import Printer, diff_strings
 
 from .. import util
-from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
+from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
 from ._util import (
@@ -90,6 +90,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
+    distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
     pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
     code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@@ -105,13 +106,20 @@ def init_fill_config_cli(
     DOCS: https://spacy.io/api/cli#init-fill-config
     """
     import_code(code_path)
-    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
+    fill_config(
+        output_file,
+        base_path,
+        distillation=distillation,
+        pretraining=pretraining,
+        diff=diff,
+    )
 
 
 def fill_config(
     output_file: Path,
     base_path: Path,
     *,
+    distillation: bool = False,
     pretraining: bool = False,
     diff: bool = False,
     silent: bool = False,
@@ -130,6 +138,9 @@ def fill_config(
     # replaced with their actual config after loading, so we have to re-add them
     sourced = util.get_sourced_components(config)
     filled["components"].update(sourced)
+    if distillation:
+        distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
+        filled = distillation_config.merge(filled)
     if pretraining:
         validate_config_for_pretrain(filled, msg)
         pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
diff --git a/spacy/default_config_distillation.cfg b/spacy/default_config_distillation.cfg
new file mode 100644
index 00000000000..1926fafa961
--- /dev/null
+++ b/spacy/default_config_distillation.cfg
@@ -0,0 +1,34 @@
+[paths]
+raw_text = null
+
+[distillation]
+corpus = "corpora.distillation"
+dropout = 0.1
+max_epochs = 1
+max_steps = 0
+student_to_teacher = {}
+
+[distillation.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 3000
+discard_oversize = false
+tolerance = 0.2
+
+[distillation.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 1e-4
+
+[corpora]
+
+[corpora.distillation]
+@readers = "spacy.PlainTextCorpus.v1"
+path = ${paths.raw_text}
+min_length = 0
+max_length = 0
diff --git a/spacy/language.py b/spacy/language.py
index 8cd439d10b1..a1fa61d0923 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -67,6 +67,9 @@
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
+# This is the base config for the [distillation] block and currently not included
+# in the main config and only added via the 'init fill-config' command
+DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
 # This is the base config for the [pretraining] block and currently not included
 # in the main config and only added via the 'init fill-config' command
 DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 831f7df058f..32fb042b5a0 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -470,6 +470,27 @@ class Config:
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaDistillEmpty(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
+class ConfigSchemaDistill(BaseModel):
+    # fmt: off
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
+    dropout: StrictFloat = Field(..., title="Dropout rate")
+    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
+    max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
+    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
@@ -477,6 +498,7 @@ class ConfigSchema(BaseModel):
     components: Dict[str, Dict[str, Any]]
     corpora: Dict[str, Reader]
     initialize: ConfigSchemaInit
+    distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]
 
     class Config:
         extra = "allow"
@@ -488,6 +510,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
+    "distill": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index dd0a53c910e..eb0dcc1e38c 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,21 +5,14 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH, Language
-from spacy.ml.models import (
-    MaxoutWindowEncoder,
-    MultiHashEmbed,
-    build_tb_parser_model,
-    build_Tok2Vec_model,
-)
-from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
-from spacy.training import Example
-from spacy.util import (
-    load_config,
-    load_config_from_str,
-    load_model_from_config,
-    registry,
-)
+from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
+from spacy.language import Language
+from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
+from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.util import load_config, load_config_from_str
+from spacy.util import load_model_from_config, registry
 
 from ..util import make_tempdir
 
@@ -74,6 +67,60 @@
 width = ${components.tok2vec.model.width}
 """
 
+distill_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v2"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[distill]
+"""
+
+
 pretrain_config_string = """
 [paths]
 train = null
@@ -209,6 +256,14 @@ def test_create_nlp_from_config():
         load_model_from_config(Config(bad_cfg), auto_fill=True)
 
 
+def test_nlp_from_distillation_config():
+    """Test that the default distillation config validates properly"""
+    config = Config().from_str(distill_config_string)
+    distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH)
+    filled = config.merge(distill_config)
+    registry.resolve(filled["distillation"], schema=ConfigSchemaDistill)
+
+
 def test_create_nlp_from_pretraining_config():
     """Test that the default pretraining config validates properly"""
     config = Config().from_str(pretrain_config_string)

From f76f2858bbf0c98b7e8a44665acec0e057871272 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:19:42 +0100
Subject: [PATCH 047/504] Language.distill: copy both reference and predicted
 (#12209)

* Language.distill: copy both reference and predicted

In distillation we also modify the teacher docs (e.g. in tok2vec
components), so we need to copy both the reference and predicted doc.

Problem caught by @shadeMe

* Make new `_copy_examples` args kwonly
---
 spacy/language.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index a1fa61d0923..cb9652e97bf 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1094,7 +1094,7 @@ def distill(
             return losses
 
         validate_distillation_examples(examples, "Language.distill")
-        examples = _copy_examples(examples)
+        examples = _copy_examples(examples, copy_x=True, copy_y=True)
 
         if sgd is None:
             if self._optimizer is None:
@@ -2409,13 +2409,18 @@ def restore(self) -> None:
         self[:] = []
 
 
-def _copy_examples(examples: Iterable[Example]) -> List[Example]:
+def _copy_examples(
+    examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
+) -> List[Example]:
     """Make a copy of a batch of examples, copying the predicted Doc as well.
     This is used in contexts where we need to take ownership of the examples
     so that they can be mutated, for instance during Language.evaluate and
     Language.update.
     """
-    return [Example(eg.x.copy(), eg.y) for eg in examples]
+    return [
+        Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
+        for eg in examples
+    ]
 
 
 def _apply_pipes(

From dea819537be2edf08b434be6dbccb077aff7c5ad Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:30:43 +0100
Subject: [PATCH 048/504] Rename language codes (Icelandic, multi-language)
 (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/_util.py                            | 153 ++++++++++++++++++
 spacy/cli/convert.py                          |   6 +
 spacy/cli/init_config.py                      |  18 +--
 spacy/cli/init_pipeline.py                    |  19 +--
 spacy/errors.py                               |   1 +
 spacy/lang/{is => isl}/__init__.py            |   2 +-
 spacy/lang/{is => isl}/stop_words.py          |   0
 spacy/lang/{xx => mul}/__init__.py            |   4 +-
 spacy/lang/{xx => mul}/examples.py            |   0
 spacy/scorer.py                               |   2 +-
 spacy/tests/README.md                         |   2 +-
 spacy/tests/conftest.py                       |  10 +-
 spacy/tests/doc/test_doc_api.py               |   2 +-
 spacy/tests/lang/{is => isl}/__init__.py      |   0
 spacy/tests/lang/{is => isl}/test_text.py     |   8 +-
 .../tests/lang/{is => isl}/test_tokenizer.py  |   8 +-
 spacy/tests/lang/{xx => mul}/__init__.py      |   0
 spacy/tests/lang/{xx => mul}/test_text.py     |   4 +-
 .../tests/lang/{xx => mul}/test_tokenizer.py  |   8 +-
 spacy/tests/lang/test_initialize.py           |   6 +-
 spacy/tests/pipeline/test_span_ruler.py       |  52 +++---
 spacy/tests/test_language.py                  |   9 +-
 spacy/tests/tokenizer/test_explain.py         |   1 +
 .../training/converters/conll_ner_to_docs.py  |   4 +-
 spacy/training/converters/json_to_docs.py     |  12 +-
 spacy/util.py                                 |   8 +-
 website/docs/api/scorer.mdx                   |   2 +-
 website/docs/usage/models.mdx                 |  12 +-
 website/meta/languages.json                   |   6 +-
 website/src/widgets/quickstart-models.js      |   2 +-
 30 files changed, 254 insertions(+), 107 deletions(-)
 rename spacy/lang/{is => isl}/__init__.py (93%)
 rename spacy/lang/{is => isl}/stop_words.py (100%)
 rename spacy/lang/{xx => mul}/__init__.py (67%)
 rename spacy/lang/{xx => mul}/examples.py (100%)
 rename spacy/tests/lang/{is => isl}/__init__.py (100%)
 rename spacy/tests/lang/{is => isl}/test_text.py (85%)
 rename spacy/tests/lang/{is => isl}/test_tokenizer.py (72%)
 rename spacy/tests/lang/{xx => mul}/__init__.py (100%)
 rename spacy/tests/lang/{xx => mul}/test_text.py (96%)
 rename spacy/tests/lang/{xx => mul}/test_tokenizer.py (68%)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ea91e64247d..52a70cc7320 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -37,6 +37,7 @@
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..compat import Literal
 from ..schemas import validate
@@ -158,6 +159,158 @@ def _parse_override(value: Any) -> Any:
         return str(value)
 
 
+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
+def load_project_config(
+    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
+) -> Dict[str, Any]:
+    """Load the project.yml file from a directory and validate it. Also make
+    sure that all directories defined in the config exist.
+
+    path (Path): The path to the project directory.
+    interpolate (bool): Whether to substitute project variables.
+    overrides (Dict[str, Any]): Optional config overrides.
+    RETURNS (Dict[str, Any]): The loaded project.yml.
+    """
+    config_path = path / PROJECT_FILE
+    if not config_path.exists():
+        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
+    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
+    try:
+        config = srsly.read_yaml(config_path)
+    except ValueError as e:
+        msg.fail(invalid_err, e, exits=1)
+    errors = validate(ProjectConfigSchema, config)
+    if errors:
+        msg.fail(invalid_err)
+        print("\n".join(errors))
+        sys.exit(1)
+    validate_project_version(config)
+    validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
+    # Make sure directories defined in config exist
+    for subdir in config.get("directories", []):
+        dir_path = path / subdir
+        if not dir_path.exists():
+            dir_path.mkdir(parents=True)
+    return config
+
+
+def substitute_project_variables(
+    config: Dict[str, Any],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    key: str = "vars",
+    env_key: str = "env",
+) -> Dict[str, Any]:
+    """Interpolate variables in the project file using the config system.
+
+    config (Dict[str, Any]): The project config.
+    overrides (Dict[str, Any]): Optional config overrides.
+    key (str): Key containing variables in project config.
+    env_key (str): Key containing environment variable mapping in project config.
+    RETURNS (Dict[str, Any]): The interpolated project config.
+    """
+    config.setdefault(key, {})
+    config.setdefault(env_key, {})
+    # Substitute references to env vars with their values
+    for config_var, env_var in config[env_key].items():
+        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
+    # Need to put variables in the top scope again so we can have a top-level
+    # section "project" (otherwise, a list of commands in the top scope wouldn't)
+    # be allowed by Thinc's config system
+    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
+    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
+    interpolated = cfg.interpolate()
+    return dict(interpolated["project"])
+
+
+def validate_project_version(config: Dict[str, Any]) -> None:
+    """If the project defines a compatible spaCy version range, chec that it's
+    compatible with the current version of spaCy.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    spacy_version = config.get("spacy_version", None)
+    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+        err = (
+            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+            f"that's not compatible with the version of spaCy you're running "
+            f"({about.__version__}). You can edit version requirement in the "
+            f"{PROJECT_FILE} to load it, but the project may not run as expected."
+        )
+        msg.fail(err, exits=1)
+
+
+def validate_project_commands(config: Dict[str, Any]) -> None:
+    """Check that project commands and workflows are valid, don't contain
+    duplicates, don't clash  and only refer to commands that exist.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    command_names = [cmd["name"] for cmd in config.get("commands", [])]
+    workflows = config.get("workflows", {})
+    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
+    if duplicates:
+        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
+        msg.fail(err, exits=1)
+    for workflow_name, workflow_steps in workflows.items():
+        if workflow_name in command_names:
+            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
+            msg.fail(err, exits=1)
+        for step in workflow_steps:
+            if step not in command_names:
+                msg.fail(
+                    f"Unknown command specified in workflow '{workflow_name}': {step}",
+                    f"Workflows can only refer to commands defined in the 'commands' "
+                    f"section of the {PROJECT_FILE}.",
+                    exits=1,
+                )
+
+
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
+    """Get the hash for a JSON-serializable object.
+
+    data: The data to hash.
+    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
+    RETURNS (str): The hash.
+    """
+    if isinstance(data, dict):
+        data = {k: v for k, v in data.items() if k not in exclude}
+    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
+    return hashlib.md5(data_str).hexdigest()
+
+
+def get_checksum(path: Union[Path, str]) -> str:
+    """Get the checksum for a file or directory given its file path. If a
+    directory path is provided, this uses all files in that directory.
+
+    path (Union[Path, str]): The file or directory path.
+    RETURNS (str): The checksum.
+    """
+    path = Path(path)
+    if not (path.is_file() or path.is_dir()):
+        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
+    if path.is_file():
+        return hashlib.md5(Path(path).read_bytes()).hexdigest()
+    else:
+        # TODO: this is currently pretty slow
+        dir_checksum = hashlib.md5()
+        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
+            dir_checksum.update(sub_file.read_bytes())
+        return dir_checksum.hexdigest()
+
+
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a66a68133b3..3844b340678 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,6 +8,8 @@
 import srsly
 from wasabi import Printer
 
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
@@ -116,6 +118,10 @@ def convert(
     input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(input_path, converter):
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 129b5a24e84..b29a2b748f2 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,15 +12,9 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import (
-    COMMAND,
-    Arg,
-    Opt,
-    import_code,
-    init_cli,
-    show_validation_error,
-    string_to_list,
-)
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import string_to_list, import_code, _handle_renamed_language_codes
+
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
@@ -50,7 +44,7 @@ class InitValues:
 def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
+    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
     pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -176,6 +170,10 @@ def init_config(
     msg = Printer(no_print=silent)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     # Filter out duplicates since tok2vec and transformer are added by template
     pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
     defaults = RECOMMENDATIONS["__default__"]
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 21eea8edf2f..0ff39d2145b 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,16 +8,8 @@
 
 from .. import util
 from ..language import Language
-from ..training.initialize import convert_vectors, init_nlp
-from ._util import (
-    Arg,
-    Opt,
-    import_code,
-    init_cli,
-    parse_config_overrides,
-    setup_gpu,
-    show_validation_error,
-)
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes
 
 
 @init_cli.command("vectors")
@@ -39,8 +31,11 @@ def init_vectors_cli(
     you can use in the [initialize] block of your config to initialize
     a model with vectors.
     """
-    if verbose:
-        util.logger.setLevel(logging.DEBUG)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
diff --git a/spacy/errors.py b/spacy/errors.py
index dcf8e60b7a1..c8c595395b3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -981,6 +981,7 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
 
+RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
 
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/isl/__init__.py
similarity index 93%
rename from spacy/lang/is/__init__.py
rename to spacy/lang/isl/__init__.py
index af126004536..50929620ced 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/isl/__init__.py
@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
 
 
 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults
 
 
diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/isl/stop_words.py
similarity index 100%
rename from spacy/lang/is/stop_words.py
rename to spacy/lang/isl/stop_words.py
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/mul/__init__.py
similarity index 67%
rename from spacy/lang/xx/__init__.py
rename to spacy/lang/mul/__init__.py
index aff8403ffc7..5170f1e861f 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/mul/__init__.py
@@ -3,10 +3,10 @@
 
 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """
 
-    lang = "xx"
+    lang = "mul"
 
 
 __all__ = ["MultiLanguage"]
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/mul/examples.py
similarity index 100%
rename from spacy/lang/xx/examples.py
rename to spacy/lang/mul/examples.py
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 9ab116deb3f..b590f86337e 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -114,7 +114,7 @@ class Scorer:
     def __init__(
         self,
         nlp: Optional["Language"] = None,
-        default_lang: str = "xx",
+        default_lang: str = "mul",
         default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index f3c96a39e7c..9ac1e6d2e34 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -86,7 +86,7 @@ These are the main fixtures that are currently available:
 
 | Fixture                             | Description                                                                  |
 | ----------------------------------- | ---------------------------------------------------------------------------- |
-| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class. |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `mul` language class. |
 | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                   |
 | `en_vocab`                          | Creates an instance of the English `Vocab`.                                  |
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 6085b89cf02..fdc9f192c2f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -85,7 +85,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):
 
 @pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class("xx")().tokenizer
+    return get_lang_class("mul")().tokenizer
 
 
 @pytest.fixture(scope="session")
@@ -250,8 +250,8 @@ def id_tokenizer():
 
 
 @pytest.fixture(scope="session")
-def is_tokenizer():
-    return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+    return get_lang_class("isl")().tokenizer
 
 
 @pytest.fixture(scope="session")
@@ -513,8 +513,8 @@ def vi_tokenizer():
 
 
 @pytest.fixture(scope="session")
-def xx_tokenizer():
-    return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+    return get_lang_class("mul")().tokenizer
 
 
 @pytest.fixture(scope="session")
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 946910b29e1..518db02e6b3 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -18,7 +18,7 @@
     TAG,
 )
 from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/isl/__init__.py
similarity index 100%
rename from spacy/tests/lang/is/__init__.py
rename to spacy/tests/lang/isl/__init__.py
diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/isl/test_text.py
similarity index 85%
rename from spacy/tests/lang/is/test_text.py
rename to spacy/tests/lang/isl/test_text.py
index 6e3654a6eda..9e177485d09 100644
--- a/spacy/tests/lang/is/test_text.py
+++ b/spacy/tests/lang/isl/test_text.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
     # Excerpt: European Convention on Human Rights
     text = """
 hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja
@@ -15,12 +15,12 @@ def test_long_text(is_tokenizer):
 virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
 og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
 """
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 120
 
 
 @pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
     text = "10. desember 1948"
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 3
diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/isl/test_tokenizer.py
similarity index 72%
rename from spacy/tests/lang/is/test_tokenizer.py
rename to spacy/tests/lang/isl/test_tokenizer.py
index 0c05a605001..ba534aaf662 100644
--- a/spacy/tests/lang/is/test_tokenizer.py
+++ b/spacy/tests/lang/isl/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
     (
         "Enginn maður skal sæta pyndingum eða ómannlegri eða "
         "vanvirðandi meðferð eða refsingu. ",
@@ -23,8 +23,8 @@
 ]
 
 
-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-    tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+    tokens = isl_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/mul/__init__.py
similarity index 100%
rename from spacy/tests/lang/xx/__init__.py
rename to spacy/tests/lang/mul/__init__.py
diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/mul/test_text.py
similarity index 96%
rename from spacy/tests/lang/xx/test_text.py
rename to spacy/tests/lang/mul/test_text.py
index 477f0ebe271..6e4262d6696 100644
--- a/spacy/tests/lang/xx/test_text.py
+++ b/spacy/tests/lang/mul/test_text.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
     # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
     text = """
 Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.  
@@ -20,5 +20,5 @@ def test_long_text(xx_tokenizer):
 Sääʹmteʹǧǧ.
 """
 
-    tokens = xx_tokenizer(text)
+    tokens = mul_tokenizer(text)
     assert len(tokens) == 179
diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/mul/test_tokenizer.py
similarity index 68%
rename from spacy/tests/lang/xx/test_tokenizer.py
rename to spacy/tests/lang/mul/test_tokenizer.py
index 15c760a6b85..3d06dc11cf7 100644
--- a/spacy/tests/lang/xx/test_tokenizer.py
+++ b/spacy/tests/lang/mul/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
     (
         "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
         [
@@ -18,8 +18,8 @@
 ]
 
 
-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-    tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+    tokens = mul_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index 8a158647a69..e0fd534d317 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -7,10 +7,10 @@
 # excluded: ja, ko, th, vi, zh
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
-             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "yo"]
 # fmt: on
 
 
diff --git a/spacy/tests/pipeline/test_span_ruler.py b/spacy/tests/pipeline/test_span_ruler.py
index 0a8616f449b..3dfbccf28e2 100644
--- a/spacy/tests/pipeline/test_span_ruler.py
+++ b/spacy/tests/pipeline/test_span_ruler.py
@@ -46,7 +46,7 @@ def person_org_date_patterns(person_org_patterns):
 
 def test_span_ruler_add_empty(patterns):
     """Test that patterns don't get added excessively."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@@ -57,7 +57,7 @@ def test_span_ruler_add_empty(patterns):
 
 
 def test_span_ruler_init(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
@@ -73,7 +73,7 @@ def test_span_ruler_init(patterns):
 
 
 def test_span_ruler_no_patterns_warns():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
@@ -85,7 +85,7 @@ def test_span_ruler_no_patterns_warns():
 
 def test_span_ruler_init_patterns(patterns):
     # initialize with patterns
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
@@ -109,7 +109,7 @@ def test_span_ruler_init_patterns(patterns):
 
 def test_span_ruler_init_clear(patterns):
     """Test that initialization clears patterns."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -118,7 +118,7 @@ def test_span_ruler_init_clear(patterns):
 
 
 def test_span_ruler_clear(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -132,7 +132,7 @@ def test_span_ruler_clear(patterns):
 
 
 def test_span_ruler_existing(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -147,7 +147,7 @@ def test_span_ruler_existing(patterns):
 
 
 def test_span_ruler_existing_overwrite(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -160,13 +160,13 @@ def test_span_ruler_existing_overwrite(patterns):
 
 
 def test_span_ruler_serialize_bytes(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_nlp = spacy.blank("xx")
+    new_nlp = spacy.blank("mul")
     new_ruler = new_nlp.add_pipe("span_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
@@ -180,7 +180,7 @@ def test_span_ruler_serialize_bytes(patterns):
 
 
 def test_span_ruler_validate():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     validated_ruler = nlp.add_pipe(
         "span_ruler", name="validated_span_ruler", config={"validate": True}
@@ -202,14 +202,14 @@ def test_span_ruler_validate():
 
 
 def test_span_ruler_properties(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
 
 
 def test_span_ruler_overlapping_spans(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -219,7 +219,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
 
 
 def test_span_ruler_scorer(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     text = "foo bar baz"
@@ -242,7 +242,7 @@ def test_span_ruler_multiprocessing(n_process):
 
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
 
-        nlp = spacy.blank("xx")
+        nlp = spacy.blank("mul")
         ruler = nlp.add_pipe("span_ruler")
         ruler.add_patterns(patterns)
 
@@ -252,7 +252,7 @@ def test_span_ruler_multiprocessing(n_process):
 
 
 def test_span_ruler_serialize_dir(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
@@ -263,7 +263,7 @@ def test_span_ruler_serialize_dir(patterns):
 
 
 def test_span_ruler_remove_basic(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina went to school"))
@@ -278,7 +278,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
 
 
 def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     assert len(ruler.patterns) == 3
@@ -289,7 +289,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
 
 
 def test_span_ruler_remove_several_patterns(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@@ -313,7 +313,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
 
 
 def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@@ -331,7 +331,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
 
 
 def test_span_ruler_remove_all_patterns(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     assert len(ruler.patterns) == 4
@@ -347,7 +347,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
 
 
 def test_span_ruler_remove_and_add():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     patterns1 = [{"label": "DATE1", "pattern": "last time"}]
     ruler.add_patterns(patterns1)
@@ -403,7 +403,7 @@ def test_span_ruler_remove_and_add():
 
 
 def test_span_ruler_spans_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@@ -415,7 +415,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
 
 
 def test_span_ruler_ents_default_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -424,7 +424,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
 
 
 def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
@@ -451,7 +451,7 @@ def pass_through_filter(spans1, spans2):
 
         return pass_through_filter
 
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 8138cb157d2..b419d77b51d 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -666,11 +666,12 @@ def test_spacy_blank():
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("mul", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
+        ("xx", "mul"),
         ("zh-Hans", "zh"),
         ("zh-Hant", None),
         ("zxx", None),
@@ -691,11 +692,11 @@ def test_language_matching(lang, target):
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("xx", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
         ("zh-Hans", "zh"),
     ],
 )
diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index 78932f6539c..073899fa50a 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -36,6 +36,7 @@
     "hu",
     pytest.param("id", marks=pytest.mark.slow()),
     pytest.param("it", marks=pytest.mark.slow()),
+    pytest.param("isl", marks=pytest.mark.slow()),
     pytest.param("kn", marks=pytest.mark.slow()),
     pytest.param("lb", marks=pytest.mark.slow()),
     pytest.param("lt", marks=pytest.mark.slow()),
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index b19d1791b27..c3490d4a494 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
         if not conll_doc:
@@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             "Segmenting sentences with sentencizer. (Use `-b model` for "
             "improved parser-based sentence segmentation.)"
         )
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
     words = [line.strip().split()[0] for line in lines]
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index b4beedd2f27..1ff7a64e09d 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,13 +1,9 @@
 import srsly
-
-from ...lang.xx import MultiLanguage
-from ...util import load_model
-from ..example import (
-    _fix_legacy_dict_data,
-    _parse_example_dict_data,
-    annotations_to_doc,
-)
 from ..gold_io import json_iterate, json_to_annotations
+from ..example import annotations_to_doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.mul import MultiLanguage
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/util.py b/spacy/util.py
index de04ee6e718..8c402a74ce9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -325,7 +325,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     import spacy.lang  # noqa: F401
 
     if lang == "xx":
-        return "xx"
+        return "mul"
 
     # Find out which language modules we have
     possible_languages = []
@@ -343,11 +343,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
-    if match == "mul":
-        # Convert 'mul' back to spaCy's 'xx'
-        return "xx"
-    else:
-        return match
+    return match
 
 
 def get_lang_class(lang: str) -> Type["Language"]:
diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx
index 9bdd0a8f435..0c2eefc6722 100644
--- a/website/docs/api/scorer.mdx
+++ b/website/docs/api/scorer.mdx
@@ -30,7 +30,7 @@ Create a new `Scorer`.
 | Name               | Description                                                                                                                                                                                                                               |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `nlp`              | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
-| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~                                                                                                                                            |
+| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~                                                                                                                                           |
 | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~                                                     |
 | _keyword-only_     |                                                                                                                                                                                                                                           |
 | `**kwargs`         | Any additional settings to pass on to the individual scoring methods. ~~Any~~                                                                                                                                                             |
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index e74c37e3080..34927ff3e7b 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -74,23 +74,23 @@ your data.
 
 > ```python
 > # Standard import
-> from spacy.lang.xx import MultiLanguage
+> from spacy.lang.mul import MultiLanguage
 > nlp = MultiLanguage()
 >
 > # With lazy-loading
-> nlp = spacy.blank("xx")
+> nlp = spacy.blank("mul")
 > ```
 
 spaCy also supports pipelines trained on more than one language. This is
 especially useful for named entity recognition. The language ID used for
-multi-language or language-neutral pipelines is `xx`. The language class, a
+multi-language or language-neutral pipelines is `mul`. The language class, a
 generic subclass containing only the base language data, can be found in
-[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).
+[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).
 
 To train a pipeline using the neutral multi-language class, you can set
-`lang = "xx"` in your [training config](/usage/training#config). You can also
+`lang = "mul"` in your [training config](/usage/training#config). You can also
 \import the `MultiLanguage` class directly, or call
-[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
+[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.
 
 ### Chinese language support {id="chinese",version="2.3"}
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index d6a07809795..e520067ba20 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -169,7 +169,7 @@
             "has_examples": true
         },
         {
-            "code": "is",
+            "code": "isl",
             "name": "Icelandic"
         },
         {
@@ -454,9 +454,9 @@
             ]
         },
         {
-            "code": "xx",
+            "code": "mul",
             "name": "Multi-language",
-            "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
+            "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"],
             "example": "This is a sentence about Facebook."
         },
         {
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index b2a0a628018..4994dc22640 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => {
                         </QS>
                         <QS config="example" prompt="python">
                             print([
-                            {code === 'xx'
+                            {code === 'mul'
                                 ? '(ent.text, ent.label) for ent in doc.ents'
                                 : '(w.text, w.pos_) for w in doc'}
                             ])

From 0ea769f576bdc9d89c19bbfd57c50f395d2c253d Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 1 Feb 2023 17:47:56 +0900
Subject: [PATCH 049/504] Move Entity Linker v1 to spacy-legacy (#12006)

* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
---
 spacy/pipeline/entity_linker.py            |  16 +
 spacy/pipeline/legacy/__init__.py          |   3 -
 spacy/pipeline/legacy/entity_linker.py     | 422 ---------------------
 spacy/tests/pipeline/test_entity_linker.py |   3 +-
 4 files changed, 18 insertions(+), 426 deletions(-)
 delete mode 100644 spacy/pipeline/legacy/__init__.py
 delete mode 100644 spacy/pipeline/legacy/entity_linker.py

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 9c4312f6dd8..db4f0e105c1 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -14,6 +14,16 @@
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
 from thinc.types import Floats2d
 
+from ..kb import KnowledgeBase, Candidate
+from ..ml import empty_kb
+from ..tokens import Doc, Span
+from .pipe import deserialize_config
+from .trainable_pipe import TrainablePipe
+from ..language import Language
+from ..vocab import Vocab
+from ..training import Example, validate_examples, validate_get_examples
+from ..errors import Errors
+from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
@@ -128,6 +138,12 @@ def make_entity_linker(
     """
 
     if not model.attrs.get("include_span_maker", False):
+        try:
+            from spacy_legacy.components.entity_linker import EntityLinker_v1
+        except:
+            raise ImportError(
+                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
+            )
         # The only difference in arguments here is that use_gold_ents and threshold aren't available.
         return EntityLinker_v1(
             nlp.vocab,
diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py
deleted file mode 100644
index f216840dc2c..00000000000
--- a/spacy/pipeline/legacy/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .entity_linker import EntityLinker_v1
-
-__all__ = ["EntityLinker_v1"]
diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py
deleted file mode 100644
index 1e46db019d5..00000000000
--- a/spacy/pipeline/legacy/entity_linker.py
+++ /dev/null
@@ -1,422 +0,0 @@
-# This file is present to provide a prior version of the EntityLinker component
-# for backwards compatability. For details see #9669.
-
-import random
-import warnings
-from itertools import islice
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
-
-import srsly
-from thinc.api import CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats2d
-
-from ... import util
-from ...errors import Errors, Warnings
-from ...kb import Candidate, KnowledgeBase
-from ...language import Language
-from ...ml import empty_kb
-from ...scorer import Scorer
-from ...tokens import Doc, Span
-from ...training import Example, validate_examples, validate_get_examples
-from ...util import SimpleFrozenList
-from ...vocab import Vocab
-from ..pipe import deserialize_config
-from ..trainable_pipe import TrainablePipe
-
-# See #9050
-BACKWARD_OVERWRITE = True
-
-
-def entity_linker_score(examples, **kwargs):
-    return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
-
-
-class EntityLinker_v1(TrainablePipe):
-    """Pipeline component for named entity linking.
-
-    DOCS: https://spacy.io/api/entitylinker
-    """
-
-    NIL = "NIL"  # string used to refer to a non-existing link
-
-    def __init__(
-        self,
-        vocab: Vocab,
-        model: Model,
-        name: str = "entity_linker",
-        *,
-        labels_discard: Iterable[str],
-        n_sents: int,
-        incl_prior: bool,
-        incl_context: bool,
-        entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
-        overwrite: bool = BACKWARD_OVERWRITE,
-        scorer: Optional[Callable] = entity_linker_score,
-    ) -> None:
-        """Initialize an entity linker.
-
-        vocab (Vocab): The shared vocabulary.
-        model (thinc.api.Model): The Thinc Model powering the pipeline component.
-        name (str): The component instance name, used to add entries to the
-            losses during training.
-        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
-        n_sents (int): The number of neighbouring sentences to take into account.
-        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
-        incl_context (bool): Whether or not to include the local context in the model.
-        entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
-            produces a list of candidates, given a certain knowledge base and a textual mention.
-        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
-        DOCS: https://spacy.io/api/entitylinker#init
-        """
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        self.labels_discard = list(labels_discard)
-        self.n_sents = n_sents
-        self.incl_prior = incl_prior
-        self.incl_context = incl_context
-        self.get_candidates = get_candidates
-        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
-        self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
-        self.scorer = scorer
-
-    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
-        """Define the KB of this pipe by providing a function that will
-        create it using this object's vocab."""
-        if not callable(kb_loader):
-            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
-
-        self.kb = kb_loader(self.vocab)
-
-    def validate_kb(self) -> None:
-        # Raise an error if the knowledge base is not initialized.
-        if self.kb is None:
-            raise ValueError(Errors.E1018.format(name=self.name))
-        if len(self.kb) == 0:
-            raise ValueError(Errors.E139.format(name=self.name))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
-    ):
-        """Initialize the pipe for training, using a representative set
-        of data examples.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
-            Note that providing this argument, will overwrite all data accumulated in the current KB.
-            Use this only when loading a KB as-such from file.
-
-        DOCS: https://spacy.io/api/entitylinker#initialize
-        """
-        validate_get_examples(get_examples, "EntityLinker_v1.initialize")
-        if kb_loader is not None:
-            self.set_kb(kb_loader)
-        self.validate_kb()
-        nO = self.kb.entity_vector_length
-        doc_sample = []
-        vector_sample = []
-        for example in islice(get_examples(), 10):
-            doc_sample.append(example.x)
-            vector_sample.append(self.model.ops.alloc1f(nO))
-        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(
-            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
-        )
-
-    def update(
-        self,
-        examples: Iterable[Example],
-        *,
-        drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
-        losses: Optional[Dict[str, float]] = None,
-    ) -> Dict[str, float]:
-        """Learn from a batch of documents and gold-standard information,
-        updating the pipe's model. Delegates to predict and get_loss.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://spacy.io/api/entitylinker#update
-        """
-        self.validate_kb()
-        if losses is None:
-            losses = {}
-        losses.setdefault(self.name, 0.0)
-        if not examples:
-            return losses
-        validate_examples(examples, "EntityLinker_v1.update")
-        sentence_docs = []
-        for eg in examples:
-            sentences = [s for s in eg.reference.sents]
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                # KB ID of the first token is the same as the whole span
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    try:
-                        # find the sentence in the list of sentences.
-                        sent_index = sentences.index(ent.sent)
-                    except AttributeError:
-                        # Catch the exception when ent.sent is None and provide a user-friendly warning
-                        raise RuntimeError(Errors.E030) from None
-                    # get n previous sentences, if there are any
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    # get n posterior sentences, or as many < n as there are
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    # get token positions
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    # append that span as a doc to training
-                    sent_doc = eg.predicted[start_token:end_token].as_doc()
-                    sentence_docs.append(sent_doc)
-        set_dropout_rate(self.model, drop)
-        if not sentence_docs:
-            warnings.warn(Warnings.W093.format(name="Entity Linker"))
-            return losses
-        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
-        loss, d_scores = self.get_loss(
-            sentence_encodings=sentence_encodings, examples=examples
-        )
-        bp_context(d_scores)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss
-        return losses
-
-    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
-        validate_examples(examples, "EntityLinker_v1.get_loss")
-        entity_encodings = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    entity_encoding = self.kb.get_vector(kb_id)
-                    entity_encodings.append(entity_encoding)
-        entity_encodings = self.model.ops.asarray2f(entity_encodings)
-        if sentence_encodings.shape != entity_encodings.shape:
-            err = Errors.E147.format(
-                method="get_loss", msg="gold entities do not match up"
-            )
-            raise RuntimeError(err)
-        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
-        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
-        loss = loss / len(entity_encodings)
-        return float(loss), gradients
-
-    def predict(self, docs: Iterable[Doc]) -> List[str]:
-        """Apply the pipeline's model to a batch of docs, without modifying them.
-        Returns the KB IDs for each entity in each doc, including NIL if there is
-        no prediction.
-
-        docs (Iterable[Doc]): The documents to predict.
-        RETURNS (List[str]): The models prediction for each document.
-
-        DOCS: https://spacy.io/api/entitylinker#predict
-        """
-        self.validate_kb()
-        entity_count = 0
-        final_kb_ids: List[str] = []
-        if not docs:
-            return final_kb_ids
-        if isinstance(docs, Doc):
-            docs = [docs]
-        for i, doc in enumerate(docs):
-            sentences = [s for s in doc.sents]
-            if len(doc) > 0:
-                # Looping through each entity (TODO: rewrite)
-                for ent in doc.ents:
-                    sent = ent.sent
-                    sent_index = sentences.index(sent)
-                    assert sent_index >= 0
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    xp = self.model.ops.xp
-                    if self.incl_context:
-                        sentence_encoding = self.model.predict([sent_doc])[0]
-                        sentence_encoding_t = sentence_encoding.T
-                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                    entity_count += 1
-                    if ent.label_ in self.labels_discard:
-                        # ignoring this entity - setting to NIL
-                        final_kb_ids.append(self.NIL)
-                    else:
-                        candidates = list(self.get_candidates(self.kb, ent))
-                        if not candidates:
-                            # no prediction possible for this entity - setting to NIL
-                            final_kb_ids.append(self.NIL)
-                        elif len(candidates) == 1:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
-                                    )
-                                # cosine similarity
-                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                    sentence_norm * entity_norm
-                                )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            best_index = scores.argmax().item()
-                            best_candidate = candidates[best_index]
-                            final_kb_ids.append(best_candidate.entity_)
-        if not (len(final_kb_ids) == entity_count):
-            err = Errors.E147.format(
-                method="predict", msg="result variables not of equal length"
-            )
-            raise RuntimeError(err)
-        return final_kb_ids
-
-    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
-        """Modify a batch of documents, using pre-computed scores.
-
-        docs (Iterable[Doc]): The documents to modify.
-        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
-
-        DOCS: https://spacy.io/api/entitylinker#set_annotations
-        """
-        count_ents = len([ent for doc in docs for ent in doc.ents])
-        if count_ents != len(kb_ids):
-            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
-        i = 0
-        overwrite = self.cfg["overwrite"]
-        for doc in docs:
-            for ent in doc.ents:
-                kb_id = kb_ids[i]
-                i += 1
-                for token in ent:
-                    if token.ent_kb_id == 0 or overwrite:
-                        token.ent_kb_id_ = kb_id
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://spacy.io/api/entitylinker#to_bytes
-        """
-        self._validate_serialization_attrs()
-        serialize = {}
-        if hasattr(self, "cfg") and self.cfg is not None:
-            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
-        serialize["kb"] = self.kb.to_bytes
-        serialize["model"] = self.model.to_bytes
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (TrainablePipe): The loaded object.
-
-        DOCS: https://spacy.io/api/entitylinker#from_bytes
-        """
-        self._validate_serialization_attrs()
-
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        if hasattr(self, "cfg") and self.cfg is not None:
-            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
-        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
-        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
-        deserialize["model"] = load_model
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://spacy.io/api/entitylinker#to_disk
-        """
-        serialize = {}
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
-        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["kb"] = lambda p: self.kb.to_disk(p)
-        serialize["model"] = lambda p: self.model.to_disk(p)
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityLinker_v1":
-        """Load the pipe from disk. Modifies the object in place and returns it.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (EntityLinker): The modified EntityLinker object.
-
-        DOCS: https://spacy.io/api/entitylinker#from_disk
-        """
-
-        def load_model(p):
-            try:
-                with p.open("rb") as infile:
-                    self.model.from_bytes(infile.read())
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize: Dict[str, Callable[[Any], Any]] = {}
-        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
-        deserialize["kb"] = lambda p: self.kb.from_disk(p)
-        deserialize["model"] = load_model
-        util.from_disk(path, deserialize, exclude)
-        return self
-
-    def rehearse(self, examples, *, sgd=None, losses=None, **config):
-        raise NotImplementedError
-
-    def add_label(self, label):
-        raise NotImplementedError
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 80b6e766347..9e955f23e43 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -11,7 +11,6 @@
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.pipeline import EntityLinker, TrainablePipe
-from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tests.util import make_tempdir
@@ -987,6 +986,8 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
+    from spacy_legacy.components.entity_linker import EntityLinker_v1
+
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From 67a1370e061c29f4e6f331726f04f14bd6067dcf Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 2 Feb 2023 22:13:38 +0900
Subject: [PATCH 050/504] Cleanup/remove backwards compat overwrite settings
 (#11888)

* Remove backwards-compatible overwrite from Entity Linker

This also adds a docstring about overwrite, since it wasn't present.

* Fix docstring

* Remove backward compat settings in Morphologizer

This also needed a docstring added.

For this component it's less clear what the right overwrite settings
are.

* Remove backward compat from sentencizer

This was simple

* Remove backward compat from senter

Another simple one

* Remove backward compat setting from tagger

* Add docstrings

* Update spacy/pipeline/morphologizer.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update docs

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/pipeline/entity_linker.py    | 11 +++--------
 spacy/pipeline/morphologizer.pyx   | 11 ++++-------
 spacy/pipeline/sentencizer.pyx     |  7 ++-----
 spacy/pipeline/senter.pyx          |  5 ++---
 spacy/pipeline/tagger.pyx          |  6 ++----
 website/docs/api/entitylinker.mdx  |  2 +-
 website/docs/api/morphologizer.mdx |  2 +-
 7 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index db4f0e105c1..21e3a279749 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -43,9 +43,6 @@
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
 
-# See #9050
-BACKWARD_OVERWRITE = True
-
 default_model_config = """
 [model]
 @architectures = "spacy.EntityLinker.v2"
@@ -76,8 +73,7 @@
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
-        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
-        "overwrite": True,
+        "overwrite": False,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "candidates_batch_size": 1,
@@ -211,8 +207,7 @@ def __init__(
         get_candidates_batch: Callable[
             [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
         ],
-        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
-        overwrite: bool = BACKWARD_OVERWRITE,
+        overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
         candidates_batch_size: int,
@@ -236,7 +231,7 @@ def __init__(
             Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
-        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
+        overwrite (bool): Whether to overwrite existing non-empty annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
             component must provide entity annotations.
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d3068bdffdd..5e7d0720a40 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -28,10 +28,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = True
-BACKWARD_EXTEND = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -113,9 +109,8 @@ class Morphologizer(Tagger):
         model: Model,
         name: str = "morphologizer",
         *,
-        overwrite: bool = BACKWARD_OVERWRITE,
-        extend: bool = BACKWARD_EXTEND,
-        label_smoothing: float = 0.0,
+        overwrite: bool = False,
+        extend: bool = False,
         scorer: Optional[Callable] = morphologizer_score,
         save_activations: bool = False,
     ):
@@ -125,6 +120,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
+        extend (bool): Whether to extend existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 08ba9d989c1..02b92e87812 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -10,10 +10,6 @@ from ..language import Language
 from .pipe import Pipe
 from .senter import senter_score
 
-# see #9050
-BACKWARD_OVERWRITE = False
-
-
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
@@ -55,13 +51,14 @@ class Sentencizer(Pipe):
         name="sentencizer",
         *,
         punct_chars=None,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=senter_score,
     ):
         """Initialize the sentencizer.
 
         punct_chars (list): Punctuation characters to split on. Will be
             serialized with the nlp object.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
 
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 185430c122c..ba45df28400 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -20,8 +20,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = False
 
 default_model_config = """
 [model]
@@ -85,7 +83,7 @@ class SentenceRecognizer(Tagger):
         model,
         name="senter",
         *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=senter_score,
         save_activations: bool = False,
     ):
@@ -95,6 +93,7 @@ class SentenceRecognizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
         save_activations (bool): save model activations in Doc when annotating.
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index a8a89332bd4..8740058174a 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -26,9 +26,6 @@ from .trainable_pipe import TrainablePipe
 
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
-# See #9050
-BACKWARD_OVERWRITE = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -98,7 +95,7 @@ class Tagger(TrainablePipe):
         model,
         name="tagger",
         *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=tagger_score,
         neg_prefix="!",
         save_activations: bool = False,
@@ -109,6 +106,7 @@ class Tagger(TrainablePipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
         save_activations (bool): save model activations in Doc when annotating.
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 238b62a2e6d..12b2f6bef1d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
 | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
 | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
 | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
 | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4660ec312fa..9514bc773b9 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |

From 1d8778fdf78d50e09ee1265439addeed7985542f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 3 Feb 2023 15:22:25 +0100
Subject: [PATCH 051/504] `Language.update`: ensure that tok2vec gets updated
 (#12136)

* `Language.update`: ensure that tok2vec gets updated

The components in a pipeline can be updated independently. However,
tok2vec implementations are an exception to this, since they depend on
listeners for their gradients. The update method of a tok2vec
implementation computes the tok2vec forward and passes this along with a
backprop function to the listeners. This backprop function accumulates
gradients for all the listeners. There are two ways in which the
accumulated gradients can be used to update the tok2vec weights:

1. Call the `finish_update` method of tok2vec *after* the `update`
   method is called on all of the pipes that use a tok2vec listener.
2. Pass an optimizer to the `update` method of tok2vec. In this
   case, tok2vec will give the last listener a special backprop
   function that calls `finish_update` on the tok2vec.

Unfortunately, `Language.update` did neither of these. Instead, it
immediately called `finish_update` on every pipe after `update`. As a
result, the tok2vec weights are updated when no gradients have been
accumulated from listeners yet. And the gradients of the listeners are
only used in the next call to `Language.update` (when `finish_update` is
called on tok2vec again).

This change fixes this issue by passing the optimizer to the `update`
method of trainable pipes, leading to use of the second strategy
outlined above.

The main updating loop in `Language.update` is also simplified by using
the `TrainableComponent` protocol consistently.

* Train loop: `sgd` is `Optional[Optimizer]`, do not pass false

* Language.update: call pipe finish_update after all pipe updates

This does correct and fast updates if multiple components update the
same parameters.

* Add comment why we moved `finish_update` to a separate loop
---
 spacy/language.py                             | 28 ++++---
 .../pipeline/test_annotates_on_update.py      | 12 ++-
 spacy/tests/test_language.py                  | 73 ++++++++++++++++++-
 spacy/training/loop.py                        |  2 +-
 4 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index cb9652e97bf..51189ab371a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1280,17 +1280,12 @@ def update(
             component_cfg[name].setdefault("drop", drop)
             pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            # ignore statements are used here because mypy ignores hasattr
-            if name not in exclude and hasattr(proc, "update"):
-                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
-            if sgd not in (None, False):
-                if (
-                    name not in exclude
-                    and isinstance(proc, ty.TrainableComponent)
-                    and proc.is_trainable
-                    and proc.model not in (True, False, None)
-                ):
-                    proc.finish_update(sgd)
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
             if name in annotates:
                 for doc, eg in zip(
                     _pipe(
@@ -1303,6 +1298,17 @@ def update(
                     examples,
                 ):
                     eg.predicted = doc
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for name, proc in self.pipeline:
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.finish_update(sgd)
+
         return losses
 
     def rehearse(
diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py
index d4feebd3045..f13a0ae5a3c 100644
--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@@ -55,9 +55,11 @@ def assert_sents(nlp, name):
         return AssertSents(name)
 
     class AssertSents:
+        model = None
+        is_trainable = True
+
         def __init__(self, name, **cfg):
             self.name = name
-            pass
 
         def __call__(self, doc):
             if not doc.has_annotation("SENT_START"):
@@ -65,10 +67,16 @@ def __call__(self, doc):
             return doc
 
         def update(self, examples, *, drop=0.0, sgd=None, losses=None):
+            losses.setdefault(self.name, 0.0)
+
             for example in examples:
                 if not example.predicted.has_annotation("SENT_START"):
                     raise ValueError("No sents")
-            return {}
+
+            return losses
+
+        def finish_update(self, sgd=None):
+            pass
 
     nlp = English()
     nlp.add_pipe("sentencizer")
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index b419d77b51d..88ef3d434c0 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -13,8 +13,12 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.util import find_matching_language, ignore_error, raise_error, registry
-from spacy.vocab import Vocab
+from spacy.lang.en import English
+from spacy.lang.de import German
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
+from spacy.util import load_model_from_config
+import spacy
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
@@ -27,6 +31,51 @@
 except ImportError:
     pass
 
+TAGGER_CFG_STRING = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
+
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v2"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
+
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v2"
+
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v1"
+    width = ${components.tok2vec.model.encode.width}
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
+
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v2"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
+
+
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
 
 TAGGER_TRAIN_DATA = [
     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
@@ -93,6 +142,26 @@ def test_language_update(nlp):
         example = Example.from_dict(doc, wrongkeyannots)
 
 
+def test_language_update_updates():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=optimizer)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    assert xp.any(
+        xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+    )
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 58d5b06786f..e6b3451cd73 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -221,7 +221,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=False,  # type: ignore[arg-type]
+                sgd=None,
                 exclude=exclude,
                 annotates=annotating_components,
             )

From ca1aa0e8afff28f3b6f0f34c7601706a6b2cca4e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:28:34 +0100
Subject: [PATCH 052/504] Use the same tuple in Span cmp and hash (#12251)

---
 spacy/tokens/span.pyx | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index bf37f955d98..7da47616489 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -133,9 +133,8 @@ cdef class Span:
             else:
                 return True
 
-        cdef SpanC* span_c = self.span_c()
-        cdef SpanC* other_span_c = other.span_c()
-
+        self_tuple = self._cmp_tuple()
+        other_tuple = other._cmp_tuple()
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
@@ -170,8 +169,20 @@ cdef class Span:
             return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
+        return hash(self._cmp_tuple())
+
+    def _cmp_tuple(self):
         cdef SpanC* span_c = self.span_c()
-        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
+        return (
+            span_c.start_char,
+            span_c.end_char,
+            span_c.start,
+            span_c.end,
+            span_c.label,
+            span_c.kb_id,
+            span_c.id,
+            self.doc,
+        )
 
     def __len__(self):
         """Get the number of tokens in the span.

From 41053158bb7338f6826b7f955a03fa69a7c2d771 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:37:42 +0100
Subject: [PATCH 053/504] Remove names for vectors (#12243)

* Remove names for vectors

Named vectors are basically a carry-over from v2 and aren't used for
anything.

* Format
---
 spacy/cli/init_pipeline.py                    |  2 --
 spacy/language.py                             | 14 +----------
 .../serialize/test_serialize_pipeline.py      |  2 +-
 spacy/tests/vocab_vectors/test_vectors.py     | 13 +++++-----
 spacy/training/initialize.py                  |  7 ------
 spacy/vectors.pyx                             |  5 +---
 spacy/vocab.pyi                               |  4 ++--
 spacy/vocab.pyx                               | 24 ++++++-------------
 website/docs/api/cli.mdx                      |  6 ++---
 website/docs/api/vectors.mdx                  |  1 -
 website/docs/api/vocab.mdx                    |  1 -
 11 files changed, 20 insertions(+), 59 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 0ff39d2145b..1a044dedbc9 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -21,7 +21,6 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
-    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
@@ -45,7 +44,6 @@ def init_vectors_cli(
         vectors_loc,
         truncate=truncate,
         prune=prune,
-        name=name,
         mode=mode,
         attr=attr,
     )
diff --git a/spacy/language.py b/spacy/language.py
index 51189ab371a..e8a7d719ef2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -194,12 +194,7 @@ def __init__(
         if not isinstance(vocab, Vocab) and vocab is not True:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
-            vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
-            if not create_vectors:
-                vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
-                create_vectors = registry.resolve(vectors_cfg)["vectors"]
-            vocab.vectors = create_vectors(vocab)
+            vocab = create_vocab(self.lang, self.Defaults)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -253,7 +248,6 @@ def meta(self) -> Dict[str, Any]:
             "width": self.vocab.vectors_length,
             "vectors": len(self.vocab.vectors),
             "keys": self.vocab.vectors.n_keys,
-            "name": self.vocab.vectors.name,
             "mode": self.vocab.vectors.mode,
         }
         self._meta["labels"] = dict(self.pipe_labels)
@@ -2275,9 +2269,6 @@ def deserialize_meta(path: Path) -> None:
             if path.exists():
                 data = srsly.read_json(path)
                 self.meta.update(data)
-                # self.meta always overrides meta["vectors"] with the metadata
-                # from self.vocab.vectors, so set the name directly
-                self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         def deserialize_vocab(path: Path) -> None:
             if path.exists():
@@ -2346,9 +2337,6 @@ def from_bytes(
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
-            # self.meta always overrides meta["vectors"] with the metadata
-            # from self.vocab.vectors, so set the name directly
-            self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         deserializers: Dict[str, Callable[[bytes], Any]] = {}
         deserializers["config.cfg"] = lambda b: self.config.from_bytes(
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 8170488f758..39fbbf58217 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -181,7 +181,7 @@ def test_issue4042_bug2():
 @pytest.mark.issue(4725)
 def test_issue4725_1():
     """Ensure the pickling of the NER goes well"""
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     nlp = English(vocab=vocab)
     config = {
         "update_with_oracle_cut_size": 111,
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 7172913141c..16574656bfb 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -84,7 +84,7 @@ def test_issue1539():
 @pytest.mark.issue(1807)
 def test_issue1807():
     """Test vocab.set_vector also adds the word to the vocab."""
-    vocab = Vocab(vectors_name="test_issue1807")
+    vocab = Vocab()
     assert "hello" not in vocab
     vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
     assert "hello" in vocab
@@ -94,13 +94,12 @@ def test_issue1807():
 def test_issue2871():
     """Test that vectors recover the correct key for spaCy reserved words."""
     words = ["dog", "cat", "SUFFIX"]
-    vocab = Vocab(vectors_name="test_issue2871")
+    vocab = Vocab()
     vocab.vectors.resize(shape=(3, 10))
     vector_data = numpy.zeros((3, 10), dtype="f")
     for word in words:
         _ = vocab[word]  # noqa: F841
         vocab.set_vector(word, vector_data[0])
-    vocab.vectors.name = "dummy_vectors"
     assert vocab["dog"].rank == 0
     assert vocab["cat"].rank == 1
     assert vocab["SUFFIX"].rank == 2
@@ -125,7 +124,7 @@ def test_issue4725_2():
         # ensures that this runs correctly and doesn't hang or crash because of the global vectors
         # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
         # or because of issues with pickling the NER (cf test_issue4725_1)
-        vocab = Vocab(vectors_name="test_vocab_add_vector")
+        vocab = Vocab()
         data = numpy.ndarray((5, 3), dtype="f")
         data[0] = 1.0
         data[1] = 2.0
@@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
 
 
 def test_vocab_add_vector():
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
@@ -356,7 +355,7 @@ def test_vocab_add_vector():
 
 
 def test_vocab_prune_vectors():
-    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
+    vocab = Vocab()
     _ = vocab["cat"]  # noqa: F841
     _ = vocab["dog"]  # noqa: F841
     _ = vocab["kitten"]  # noqa: F841
@@ -406,7 +405,7 @@ def test_vectors_serialize():
 
 
 def test_vector_is_oov():
-    vocab = Vocab(vectors_name="test_vocab_is_oov")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 0621702214c..191821e786e 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -214,7 +214,6 @@ def convert_vectors(
     *,
     truncate: int,
     prune: int,
-    name: Optional[str] = None,
     mode: str = VectorsMode.default,
     attr: str = "ORTH",
 ) -> None:
@@ -262,12 +261,6 @@ def convert_vectors(
                     attr=attr,
                 )
                 nlp.vocab.deduplicate_vectors()
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1 and mode != VectorsMode.floret:
         nlp.vocab.prune_vectors(prune)
 
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 6ff99bb59eb..e16efd2738d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -124,7 +124,6 @@ cdef class Vectors(BaseVectors):
     DOCS: https://spacy.io/api/vectors
     """
     cdef public object strings
-    cdef public object name
     cdef readonly object mode
     cdef public object data
     cdef public object key2row
@@ -137,14 +136,13 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
         """Create a new vector store.
 
         strings (StringStore): The string store.
         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray or cupy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
-        name (str): A name to identify the vectors table.
         mode (str): Vectors mode: "default" or "floret" (default: "default").
         minn (int): The floret char ngram minn (default: 0).
         maxn (int): The floret char ngram maxn (default: 0).
@@ -160,7 +158,6 @@ cdef class Vectors(BaseVectors):
         self.strings = strings
         if self.strings is None:
             self.strings = StringStore()
-        self.name = name
         if mode not in Mode.values():
             raise ValueError(
                 Errors.E202.format(
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 7f5f23e7847..7fbb9764f10 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -12,7 +12,8 @@ from .tokens import Doc, Span
 from .vectors import Vectors
 
 def create_vocab(
-    lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
+    lang: Optional[str],
+    defaults: Any,
 ) -> Vocab: ...
 
 class Vocab:
@@ -29,7 +30,6 @@ class Vocab:
         strings: Optional[Union[List[str], StringStore]] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
-        vectors_name: Optional[str] = ...,
         writing_system: Dict[str, Any] = ...,
         get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
     ) -> None: ...
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 8ac1215dead..3145f51844a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -20,7 +20,7 @@ from .vectors import Mode as VectorsMode
 from .vectors import Vectors
 
 
-def create_vocab(lang, defaults, vectors_name=None):
+def create_vocab(lang, defaults):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@@ -36,7 +36,6 @@ def create_vocab(lang, defaults, vectors_name=None):
         lex_attr_getters=lex_attrs,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
-        vectors_name=vectors_name,
     )
 
 
@@ -47,17 +46,9 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(
-        self,
-        lex_attr_getters=None,
-        strings=tuple(),
-        lookups=None,
-        oov_prob=-20.,
-        vectors_name=None,
-        writing_system={},  # no-cython-lint
-        get_noun_chunks=None,
-        **deprecated_kwargs
-    ):
+    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
+                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
+                 **deprecated_kwargs):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -66,7 +57,6 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """
@@ -83,7 +73,7 @@ cdef class Vocab:
                 _ = self[string]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
-        self.vectors = Vectors(strings=self.strings, name=vectors_name)
+        self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
@@ -320,7 +310,7 @@ cdef class Vocab:
             for key, row in self.vectors.key2row.items()
         }
         # replace vectors with deduplicated version
-        self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=data)
         for key, row in key2row.items():
             self.vectors.add(key, row=row)
 
@@ -377,7 +367,7 @@ cdef class Vocab:
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
-        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
         syn_keys = ops.to_numpy(syn_keys)
         remap = {}
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index acc2ce1caa2..3f91e1ff71e 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -225,7 +225,7 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>
 
 ```bash
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
 ```
 
 | Name               | Description                                                                                                                                                                                                                                                         |
@@ -235,9 +235,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
 | `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
-| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str \(option)~~                                                                                                                                                |
-| `--attr`, `-a`     | Token attribute to use for vectors, e.g. `LOWER` or `NORM`) Defaults to `ORTH`. ~~str \(option)~~                                                                                                                                                                   |
-| `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
+| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~                                                                                                                                      |
 | `--verbose`, `-V`  | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      |
 | `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
 | **CREATES**        | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |
diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx
index 0e92eb12ba4..39b309e1377 100644
--- a/website/docs/api/vectors.mdx
+++ b/website/docs/api/vectors.mdx
@@ -52,7 +52,6 @@ modified later.
 | `shape`                                   | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
 | `data`                                    | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~                                                                                                                              |
 | `keys`                                    | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~                                                                                                                |
-| `name`                                    | A name to identify the vectors table. ~~str~~                                                                                                                                          |
 | `mode` <Tag variant="new">3.2</Tag>       | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~                                                                         |
 | `minn` <Tag variant="new">3.2</Tag>       | The floret char ngram minn (default: `0`). ~~int~~                                                                                                                                     |
 | `maxn` <Tag variant="new">3.2</Tag>       | The floret char ngram maxn (default: `0`). ~~int~~                                                                                                                                     |
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 57618397da5..36369c78427 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -34,7 +34,6 @@ Create the vocabulary.
 | `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           |
 | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      |
 | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             |
-| `vectors_name`     | A name to identify the vectors table. ~~str~~                                                                                                                           |
 | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          |
 | `get_noun_chunks`  | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
 

From 9524de6295dcc6b1b2be8f1b3887be3928ee8fd8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:46:07 +0100
Subject: [PATCH 054/504] Remove unused Span.char_span(id=) (#12250)

---
 spacy/tokens/span.pyi     | 1 -
 spacy/tokens/span.pyx     | 3 +--
 website/docs/api/span.mdx | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index ae4a6209e7e..373b4ed1afe 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -103,7 +103,6 @@ class Span:
         label: Union[int, str] = ...,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
-        id: Union[int, str] = ...,
         alignment_mode: str = ...,
         span_id: Union[int, str] = ...,
     ) -> Span: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7da47616489..3f8630c638e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -694,7 +694,7 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
         start (int): The index of the first character of the span.
@@ -704,7 +704,6 @@ cdef class Span:
         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
-        id (Union[int, str]): Unused.
         alignment_mode (str): How character indices are aligned to token
             boundaries. Options: "strict" (character indices must be aligned
             with token boundaries), "contract" (span of all tokens completely
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 1774a298ff2..fa5791c405e 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -193,7 +193,6 @@ the character indices don't map to a valid span.
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| `id`                                            | Unused. ~~Union[int, str]~~                                                                                                                                                                                                                                                  |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | `span_id` <Tag variant="new">3.5.1</Tag>        | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
 | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

From 66269b273605e3d3ad6f5bfa968c33ea80b117fe Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 15 Feb 2023 12:34:33 +0100
Subject: [PATCH 055/504] Make Span.char_span optional args keyword-only
 (#12257)

* Make Span.char_span optional args keyword-only

* Make kb_id and following kw-only

* Format
---
 spacy/tokens/doc.pyi      | 3 ++-
 spacy/tokens/doc.pyx      | 4 ++--
 spacy/tokens/span.pyi     | 1 +
 spacy/tokens/span.pyx     | 6 +++---
 website/docs/api/doc.mdx  | 1 +
 website/docs/api/span.mdx | 5 +++--
 6 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 97c3f69f430..11f8a1c5eb8 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -124,6 +124,7 @@ class Doc:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
@@ -151,7 +152,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 56ee216d17f..79bb965bb3c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -556,7 +556,7 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice
         `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
         created.
@@ -1647,7 +1647,7 @@ cdef class Doc:
         for span_group in doc_json.get("spans", {}):
             spans = []
             for span in doc_json["spans"][span_group]:
-                char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"])
+                char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
                 if char_span is None:
                     raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
                 spans.append(char_span)
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 373b4ed1afe..3c85542bb3d 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -101,6 +101,7 @@ class Span:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 3f8630c638e..883a67f3dd6 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -694,11 +694,11 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
         label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 28757cbc45f..f53e209afc8 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -214,6 +214,7 @@ alignment mode `"strict".
 | `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
 | `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| _keyword-only_                           |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index fa5791c405e..ae7ef7203b6 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -188,9 +188,10 @@ the character indices don't map to a valid span.
 
 | Name                                            | Description                                                                                                                                                                                                                                                                  |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `start_idx`                                     | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end_idx`                                       | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| _keyword-only_                                  |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |

From ee1b9bb8f2480f0cd89d30be0bd9b85d8b3470e4 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 16 Feb 2023 19:08:55 +0900
Subject: [PATCH 056/504] Use tempfile.TemporaryDirectory (#12285)

---
 spacy/util.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 8c402a74ce9..7448da8ded0 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1056,22 +1056,11 @@ def make_tempdir() -> Generator[Path, None, None]:
     its contents at the end of the with block.
     YIELDS (Path): The path of the temp directory.
     """
-    d = Path(tempfile.mkdtemp())
-    yield d
-
-    # On Windows, git clones use read-only files, which cause permission errors
-    # when being deleted. This forcibly fixes permissions.
-    def force_remove(rmfunc, path, ex):
-        os.chmod(path, stat.S_IWRITE)
-        rmfunc(path)
-
     try:
-        if sys.version_info >= (3, 12):
-            shutil.rmtree(str(d), onexc=force_remove)
-        else:
-            shutil.rmtree(str(d), onerror=force_remove)
+        with tempfile.TemporaryDirectory() as td:
+            yield Path(td)
     except PermissionError as e:
-        warnings.warn(Warnings.W091.format(dir=d, msg=e))
+        warnings.warn(Warnings.W091.format(dir=td, msg=e))
 
 
 def is_in_jupyter() -> bool:

From 5b344752558fdbcaf940fe141bd9bef9ed1aae01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 21 Feb 2023 15:47:18 +0100
Subject: [PATCH 057/504] Reimplement distillation with oracle cut size
 (#12214)

* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 101 +++++++++++++++++++++------
 spacy/tests/parser/test_model.py     |  61 ++++++++++++++++
 spacy/tests/parser/test_ner.py       |   5 +-
 spacy/tests/parser/test_parse.py     |   5 +-
 5 files changed, 152 insertions(+), 24 deletions(-)
 create mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00bd..9b2114900d3 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 6a50dbacaeb..ef2e3314e85 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -45,6 +45,11 @@ from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
 from ._parser_internals import _beam_utils
 
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
+
 NUMPY_OPS = NumpyOps()
 
 
@@ -262,8 +267,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -274,12 +279,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs,
+            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
+            states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -528,7 +533,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_to_actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -648,7 +653,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch(self, teacher_step_model, docs, max_length):
+    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -657,10 +662,12 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
-            StateClass start_state
             StateClass state
-            Transition action
-        all_states = self.moves.init_batch(docs)
+            TransitionSystem moves = teacher_pipe.moves
+
+        # Start with the same heuristic as in supervised training: exclude
+        # docs that are within the maximum length.
+        all_states = moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -669,18 +676,28 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
+
+        if not to_cut:
+            return states
+
+        # Parse the states that are too long with the teacher's parsing model.
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+            states=[state.copy() for state in to_cut])
+        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+
+        # Step through the teacher's actions and store every state after
+        # each multiple of max_length.
+        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            # Move states forward max_length actions.
-            length = 0
-            while to_cut and length < max_length:
-                teacher_scores = teacher_step_model.predict(to_cut)
-                self.transition_states(to_cut, teacher_scores)
-                # States that are completed do not need further cutting.
-                to_cut = [state for state in to_cut if not state.is_final()]
-                length += 1
-        return states
+            for step_actions in teacher_actions[:max_length]:
+                to_cut = moves.apply_actions(to_cut, step_actions)
+            teacher_actions = teacher_actions[max_length:]
+
+            if len(teacher_actions) < max_length:
+                break
 
+        return states
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
@@ -742,7 +759,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
+def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -763,3 +780,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
+
+def _states_diff_to_actions(
+    before_states: List[StateClass],
+    after_states: List[StateClass]
+) -> List[Ints1d]:
+    """
+    Return for two sets of states the actions to go from the first set of
+    states to the second set of states. The histories of the first set of
+    states must be a prefix of the second set of states.
+    """
+    cdef StateClass before_state, after_state
+    cdef StateC* c_state_before
+    cdef StateC* c_state_after
+
+    assert len(before_states) == len(after_states)
+
+    # Check invariant: before states histories must be prefixes of after states.
+    for before_state, after_state in zip(before_states, after_states):
+        c_state_before = before_state.c
+        c_state_after = after_state.c
+
+        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+            c_state_after.history.begin())
+
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for before_state, after_state in zip(before_states, after_states):
+            c_state_before = before_state.c
+            c_state_after = after_state.c
+            if step < c_state_after.history.size() - c_state_before.history.size():
+                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 00000000000..8c1cf7a9346
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b2c39ae88bc..2c520b7daf6 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a6e1852514d..4c709932bb1 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -460,7 +460,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -478,6 +480,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From c3b960642d22a4d4e5c6488588cd77086059d83a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 23 Feb 2023 11:36:50 +0100
Subject: [PATCH 058/504] Remove backoff from .vector to .tensor (#12292)

---
 spacy/tokens/doc.pyx                           |  3 ---
 spacy/tokens/span.pyx                          |  2 --
 spacy/tokens/token.pyx                         |  6 +-----
 website/docs/usage/101/_vectors-similarity.mdx | 15 +++++++++------
 4 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 79bb965bb3c..d44e83182f8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -702,9 +702,6 @@ cdef class Doc:
             elif self.vocab.vectors.size > 0:
                 self._vector = sum(t.vector for t in self) / len(self)
                 return self._vector
-            elif self.tensor.size > 0:
-                self._vector = self.tensor.mean(axis=0)
-                return self._vector
             else:
                 return xp.zeros((self.vocab.vectors_length,), dtype="float32")
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 883a67f3dd6..c439c8655dc 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -561,8 +561,6 @@ cdef class Span:
             return self.doc.user_span_hooks["has_vector"](self)
         elif self.vocab.vectors.size > 0:
             return any(token.has_vector for token in self)
-        elif self.doc.tensor.size > 0:
-            return True
         else:
             return False
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 3a7ce45c54a..6c4806ff9cb 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -415,9 +415,7 @@ cdef class Token:
         """
         if "has_vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["has_vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return True
-        return self.vocab.has_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
+        return self.vocab.has_vector(self.c.lex.orth)
 
     @property
     def vector(self):
@@ -430,8 +428,6 @@ cdef class Token:
         """
         if "vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return self.doc.tensor[self.i]
         else:
             return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
 
diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx
index 6deab926d25..39ee8e48a43 100644
--- a/website/docs/usage/101/_vectors-similarity.mdx
+++ b/website/docs/usage/101/_vectors-similarity.mdx
@@ -22,17 +22,20 @@ array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
 <Infobox title="Important note" variant="warning">
 
 To make them compact and fast, spaCy's small [pipeline packages](/models) (all
-packages that end in `sm`) **don't ship with word vectors**, and only include
-context-sensitive **tensors**. This means you can still use the `similarity()`
-methods to compare documents, spans and tokens – but the result won't be as
-good, and individual tokens won't have any vectors assigned. So in order to use
-_real_ word vectors, you need to download a larger pipeline package:
+packages that end in `sm`) **don't ship with word vectors**. In order to use
+`similarity()`, you need to download a larger pipeline package that includes
+vectors:
 
 ```diff
 - python -m spacy download en_core_web_sm
-+ python -m spacy download en_core_web_lg
++ python -m spacy download en_core_web_md
 ```
 
+In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
+backing off to context-sensitive tensors from the `tok2vec` component. These
+tensors do not work well for this purpose and this backoff has been removed in
+spaCy v4.
+
 </Infobox>
 
 Pipeline packages that come with built-in word vectors make them available as

From 92aca18c473a28594d9d7db4df7515f4cca9d6b3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 1 Mar 2023 16:00:02 +0100
Subject: [PATCH 059/504] Return Tuple[Span] for all Doc/Span attrs that
 provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
---
 spacy/tokens/doc.pyi      |  4 ++--
 spacy/tokens/doc.pyx      | 23 +++++++++++------------
 spacy/tokens/span.pyi     |  4 +++-
 spacy/tokens/span.pyx     | 28 ++++++++++++++++------------
 website/docs/api/doc.mdx  | 23 +++++++++++------------
 website/docs/api/span.mdx | 33 ++++++++++++++++-----------------
 6 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 11f8a1c5eb8..2b39d5baa28 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -155,9 +155,9 @@ class Doc:
         default: str = ...,
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
-    def sents(self) -> Iterator[Span]: ...
+    def sents(self) -> Tuple[Span]: ...
     @property
     def lang(self) -> int: ...
     @property
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d44e83182f8..893ba9c2cda 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -748,10 +748,10 @@ cdef class Doc:
         return self.text
 
     property ents:
-        """The named entities in the document. Returns a tuple of named entity
+        """The named entities in the document. Returns a list of named entity
         `Span` objects, if the entity recognizer has been applied.
 
-        RETURNS (tuple): Entities in the document, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
 
         DOCS: https://spacy.io/api/doc#ents
         """
@@ -909,7 +909,7 @@ cdef class Doc:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
 
-        YIELDS (Span): Noun chunks in the document.
+        RETURNS (Tuple[Span]): Noun chunks in the document.
 
         DOCS: https://spacy.io/api/doc#noun_chunks
         """
@@ -918,36 +918,35 @@ cdef class Doc:
 
         # Accumulate the result before beginning to iterate over it. This
         # prevents the tokenization from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenization changing, so it's okay once we have the Span
-        # objects. See Issue #375.
+        # during the iteration.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+        return tuple(spans)
 
     @property
     def sents(self):
         """Iterate over the sentences in the document. Yields sentence `Span`
         objects. Sentence spans have no label.
 
-        YIELDS (Span): Sentences in the document.
+        RETURNS (Tuple[Span]): Sentences in the document.
 
         DOCS: https://spacy.io/api/doc#sents
         """
         if not self.has_annotation("SENT_START"):
             raise ValueError(Errors.E030)
         if "sents" in self.user_hooks:
-            yield from self.user_hooks["sents"](self)
+            return tuple(self.user_hooks["sents"](self))
         else:
             start = 0
+            spans = []
             for i in range(1, self.length):
                 if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
+                    spans.append(Span(self, start, i))
                     start = i
             if start != self.length:
-                yield Span(self, start, self.length)
+                spans.append(Span(self, start, self.length))
+            return tuple(spans)
 
     @property
     def lang(self):
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 3c85542bb3d..2a529593e5f 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -81,6 +81,8 @@ class Span:
     @property
     def ents(self) -> Tuple[Span]: ...
     @property
+    def sents(self) -> Tuple[Span]: ...
+    @property
     def has_vector(self) -> bool: ...
     @property
     def vector(self) -> Floats1d: ...
@@ -93,7 +95,7 @@ class Span:
     @property
     def text_with_ws(self) -> str: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
     def root(self) -> Token: ...
     def char_span(
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c439c8655dc..1378889c681 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -487,20 +487,21 @@ cdef class Span:
         """Obtain the sentences that contain this span. If the given span
         crosses sentence boundaries, return all sentences it is a part of.
 
-        RETURNS (Iterable[Span]): All sentences that the span is a part of.
+        RETURNS (Tuple[Span]): All sentences that the span is a part of.
 
-         DOCS: https://spacy.io/api/span#sents
+        DOCS: https://spacy.io/api/span#sents
         """
         cdef int start
         cdef int i
 
         if "sents" in self.doc.user_span_hooks:
-            yield from self.doc.user_span_hooks["sents"](self)
-        elif "sents" in self.doc.user_hooks:
+            return tuple(self.doc.user_span_hooks["sents"](self))
+        spans = []
+        if "sents" in self.doc.user_hooks:
             for sentence in self.doc.user_hooks["sents"](self.doc):
                 if sentence.end > self.start:
                     if sentence.start < self.end or sentence.start == self.start == self.end:
-                        yield sentence
+                        spans.append(sentence)
                     else:
                         break
         else:
@@ -515,12 +516,13 @@ cdef class Span:
             # Now, find all the sentences in the span
             for i in range(start + 1, self.doc.length):
                 if self.doc.c[i].sent_start == 1:
-                    yield Span(self.doc, start, i)
+                    spans.append(Span(self.doc, start, i))
                     start = i
                     if start >= self.end:
                         break
-                elif i == self.doc.length - 1:
-                    yield Span(self.doc, start, self.doc.length)
+            if start < self.end:
+                spans.append(Span(self.doc, start, self.end))
+        return tuple(spans)
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
@@ -531,7 +533,7 @@ cdef class Span:
         """The named entities that fall completely within the span. Returns
         a tuple of `Span` objects.
 
-        RETURNS (tuple): Entities in the span, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
 
         DOCS: https://spacy.io/api/span#ents
         """
@@ -546,7 +548,7 @@ cdef class Span:
                     ents.append(ent)
                 else:
                     break
-        return ents
+        return tuple(ents)
 
     @property
     def has_vector(self):
@@ -641,13 +643,15 @@ cdef class Span:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
 
-        YIELDS (Span): Noun chunks in the span.
+        RETURNS (Tuple[Span]): Noun chunks in the span.
 
         DOCS: https://spacy.io/api/span#noun_chunks
         """
+        spans = []
         for span in self.doc.noun_chunks:
             if span.start >= self.start and span.end <= self.end:
-                yield span
+                spans.append(span)
+        return tuple(spans)
 
     @property
     def root(self):
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index f53e209afc8..e92c0e833e0 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
 
 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the document. Yields base noun-phrase
-`Span` objects, if the document has been syntactically parsed. A base noun
-phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
-nested within it – so no NP-level coordination, no prepositional phrases, and no
-relative clauses.
+Returns a tuple of the base noun phrases in the doc, if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
 > assert chunks[1].text == "another phrase"
 > ```
 
-| Name       | Description                           |
-| ---------- | ------------------------------------- |
-| **YIELDS** | Noun chunks in the document. ~~Span~~ |
+| Name        | Description                                  |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
 
 ## Doc.sents {id="sents",tag="property",model="sentences"}
 
-Iterate over the sentences in the document. Sentence spans have no label.
+Returns a tuple of the sentences in the document. Sentence spans have no label.
 
 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -697,9 +696,9 @@ will raise an error otherwise.
 > assert [s.root.text for s in sents] == ["is", "'s"]
 > ```
 
-| Name       | Description                         |
-| ---------- | ----------------------------------- |
-| **YIELDS** | Sentences in the document. ~~Span~~ |
+| Name        | Description                                |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
 
 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
 
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index ae7ef7203b6..cd70d8dcead 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
 > assert ents[0].text == "Mr. Best"
 > ```
 
-| Name        | Description                                                       |
-| ----------- | ----------------------------------------------------------------- |
-| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
+| Name        | Description                                                  |
+| ----------- | ------------------------------------------------------------ |
+| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
 
 ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
-objects, if the document has been syntactically parsed. A base noun phrase, or
-"NP chunk", is a noun phrase that does not permit other NPs to be nested within
-it – so no NP-level coordination, no prepositional phrases, and no relative
-clauses.
+Returns a tuple of the base noun phrases in the span if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
 has not been implemeted for the given language, a `NotImplementedError` is
@@ -301,9 +300,9 @@ raised.
 > assert chunks[0].text == "another phrase"
 > ```
 
-| Name       | Description                       |
-| ---------- | --------------------------------- |
-| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+| Name        | Description                              |
+| ----------- | ---------------------------------------- |
+| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
 
 ## Span.as_doc {id="as_doc",tag="method"}
 
@@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
 
 ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
 
-Returns a generator over the sentences the span belongs to. This property is
-only available when [sentence boundaries](/usage/linguistic-features#sbd) have
-been set on the document by the `parser`, `senter`, `sentencizer` or some custom
+Returns a tuple of the sentences the span belongs to. This property is only
+available when [sentence boundaries](/usage/linguistic-features#sbd) have been
+set on the document by the `parser`, `senter`, `sentencizer` or some custom
 function. It will raise an error otherwise.
 
 If the span happens to cross sentence boundaries, all sentences the span
@@ -541,9 +540,9 @@ overlaps with will be returned.
 > assert len(span.sents) == 2
 > ```
 
-| Name        | Description                                                                |
-| ----------- | -------------------------------------------------------------------------- |
-| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
+| Name        | Description                                                   |
+| ----------- | ------------------------------------------------------------- |
+| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
 
 ## Attributes {id="attributes"}
 

From 7c6d6abfb76603db9c627a63a8a6c440ac6e61d1 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 060/504] Drop support for EntityLinker_v1. (#12377)

---
 spacy/errors.py                            |  1 +
 spacy/pipeline/entity_linker.py            | 23 ++--------------------
 spacy/tests/pipeline/test_entity_linker.py |  7 +------
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index c8c595395b3..83a1e9ba2c0 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -980,6 +980,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 21e3a279749..546bd9f6e2a 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -132,28 +132,9 @@ def make_entity_linker(
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
     """
-
     if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+        raise ValueError(Errors.E4005)
+
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 9e955f23e43..f28a4c9d5b9 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -980,13 +980,11 @@ def test_scorer_links():
 @pytest.mark.parametrize(
     "name,config",
     [
-        ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
     ],
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
 
     # Ensure that the legacy architectures still work
     vector_length = 3
@@ -1009,10 +1007,7 @@ def create_kb(vocab):
         return mykb
 
     entity_linker = nlp.add_pipe(name, config={"model": config})
-    if config["@architectures"] == "spacy.EntityLinker.v1":
-        assert isinstance(entity_linker, EntityLinker_v1)
-    else:
-        assert isinstance(entity_linker, EntityLinker)
+    assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
 

From f84e5dfc487795e3720d2ea5b9d6eb31d6fca651 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 9 Mar 2023 09:37:19 +0100
Subject: [PATCH 061/504] `Tok2Vec`: Add `distill` method (#12108)

* `Tok2Vec`: Add `distill` method

* `Tok2Vec`: Refactor `update`

* Add `Tok2Vec.distill` test

* Update `distill` signature to accept `Example`s instead of separate teacher and student docs

* Add docs

* Remove docstring

* Update test

* Remove `update` calls from test

* Update `Tok2Vec.distill` docstring
---
 spacy/pipeline/tok2vec.py            | 125 ++++++++++++++++++++-------
 spacy/tests/pipeline/test_tok2vec.py | 117 +++++++++++++++----------
 website/docs/api/tok2vec.mdx         |  37 ++++++++
 3 files changed, 204 insertions(+), 75 deletions(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 677f5eec16c..f168aee2ec4 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,3 +1,6 @@
+from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
+from thinc.api import Model, set_dropout_rate, Optimizer, Config
+from thinc.types import Floats2d
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
 
@@ -158,39 +161,9 @@ def update(
 
         DOCS: https://spacy.io/api/tok2vec#update
         """
-        if losses is None:
-            losses = {}
         validate_examples(examples, "Tok2Vec.update")
         docs = [eg.predicted for eg in examples]
-        set_dropout_rate(self.model, drop)
-        tokvecs, bp_tokvecs = self.model.begin_update(docs)
-        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-        losses.setdefault(self.name, 0.0)
-
-        def accumulate_gradient(one_d_tokvecs):
-            """Accumulate tok2vec loss and gradient. This is passed as a callback
-            to all but the last listener. Only the last one does the backprop.
-            """
-            nonlocal d_tokvecs
-            for i in range(len(one_d_tokvecs)):
-                d_tokvecs[i] += one_d_tokvecs[i]
-                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
-            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-
-        def backprop(one_d_tokvecs):
-            """Callback to actually do the backprop. Passed to last listener."""
-            accumulate_gradient(one_d_tokvecs)
-            d_docs = bp_tokvecs(d_tokvecs)
-            if sgd is not None:
-                self.finish_update(sgd)
-            return d_docs
-
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners[:-1]:
-            listener.receive(batch_id, tokvecs, accumulate_gradient)
-        if self.listeners:
-            self.listeners[-1].receive(batch_id, tokvecs, backprop)
-        return losses
+        return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
 
     def get_loss(self, examples, scores) -> None:
         pass
@@ -220,6 +193,96 @@ def initialize(
     def add_label(self, label):
         raise NotImplementedError
 
+    def distill(
+        self,
+        teacher_pipe: Optional["TrainablePipe"],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ) -> Dict[str, float]:
+        """Performs an update of the student pipe's model using the
+        student's distillation examples and sets the annotations
+        of the teacher's distillation examples using the teacher pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
+            for prediction.
+        examples (Iterable[Example]): Distillation examples. The reference (teacher)
+            and predicted (student) docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/tok2vec#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        teacher_docs = [eg.reference for eg in examples]
+        student_docs = [eg.predicted for eg in examples]
+        teacher_preds = teacher_pipe.predict(teacher_docs)
+        teacher_pipe.set_annotations(teacher_docs, teacher_preds)
+        return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
+
+    def _update_with_docs(
+        self,
+        docs: Iterable[Doc],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        set_dropout_rate(self.model, drop)
+
+        tokvecs, accumulate_gradient, backprop = self._create_backprops(
+            docs, losses, sgd=sgd
+        )
+        batch_id = Tok2VecListener.get_batch_id(docs)
+        for listener in self.listeners[:-1]:
+            listener.receive(batch_id, tokvecs, accumulate_gradient)
+        if self.listeners:
+            self.listeners[-1].receive(batch_id, tokvecs, backprop)
+        return losses
+
+    def _create_backprops(
+        self,
+        docs: Iterable[Doc],
+        losses: Dict[str, float],
+        *,
+        sgd: Optional[Optimizer] = None,
+    ) -> Tuple[Floats2d, Callable, Callable]:
+        tokvecs, bp_tokvecs = self.model.begin_update(docs)
+        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def accumulate_gradient(one_d_tokvecs):
+            """Accumulate tok2vec loss and gradient. This is passed as a callback
+            to all but the last listener. Only the last one does the backprop.
+            """
+            nonlocal d_tokvecs
+            for i in range(len(one_d_tokvecs)):
+                d_tokvecs[i] += one_d_tokvecs[i]
+                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
+            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def backprop(one_d_tokvecs):
+            """Callback to actually do the backprop. Passed to last listener."""
+            accumulate_gradient(one_d_tokvecs)
+            d_docs = bp_tokvecs(d_tokvecs)
+            if sgd is not None:
+                self.finish_update(sgd)
+            return d_docs
+
+        return tokvecs, accumulate_gradient, backprop
+
 
 class Tok2VecListener(Model):
     """A layer that gets fed its answers from an upstream connection,
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 9648341a106..e557e294112 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -561,55 +561,84 @@ def test_tok2vec_listeners_textcat():
     assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
 
 
-def test_tok2vec_listener_source_link_name():
-    """The component's internal name and the tok2vec listener map correspond
-    to the most recently modified pipeline.
-    """
-    orig_config = Config().from_str(cfg_string_multi)
-    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+cfg_string_distillation = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
 
-    nlp2 = English()
-    nlp2.add_pipe("tok2vec", source=nlp1)
-    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v2"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
 
-    # there is no way to have the component have the right name for both
-    # pipelines, right now the most recently modified pipeline is prioritized
-    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v2"
 
-    # there is no way to have the tok2vec have the right listener map for both
-    # pipelines, right now the most recently modified pipeline is prioritized
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
-    nlp2.add_pipe("ner", name="ner3", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
-    nlp2.remove_pipe("ner3")
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
-    nlp2.remove_pipe("tagger2")
-    assert nlp2.get_pipe("tok2vec").listening_components == []
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v2"
+    width = ${components.tok2vec.model.encode.width}
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
 
-    # at this point the tok2vec component corresponds to nlp2
-    assert nlp1.get_pipe("tok2vec").listening_components == []
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v2"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
 
-    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
-    nlp1.add_pipe("sentencizer")
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
 
-    # modifying nlp2 syncs it back to nlp2
-    nlp2.add_pipe("sentencizer")
-    assert nlp1.get_pipe("tok2vec").listening_components == []
+def test_tok2vec_distillation_teacher_annotations():
+    orig_config = Config().from_str(cfg_string_distillation)
+    teacher_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
+    student_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
 
+    train_examples_teacher = []
+    train_examples_student = []
+    for t in TRAIN_DATA:
+        train_examples_teacher.append(
+            Example.from_dict(teacher_nlp.make_doc(t[0]), t[1])
+        )
+        train_examples_student.append(
+            Example.from_dict(student_nlp.make_doc(t[0]), t[1])
+        )
 
-def test_tok2vec_listener_source_replace_listeners():
-    orig_config = Config().from_str(cfg_string_multi)
-    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
-    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
-    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
-
-    nlp2 = English()
-    nlp2.add_pipe("tok2vec", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == []
-    nlp2.add_pipe("tagger", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == []
-    nlp2.add_pipe("ner", name="ner2", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
+    optimizer = teacher_nlp.initialize(lambda: train_examples_teacher)
+    student_nlp.initialize(lambda: train_examples_student)
+
+    # Since Language.distill creates a copy of the examples to use as
+    # its internal teacher/student docs, we'll need to monkey-patch the
+    # tok2vec pipe's distill method.
+    student_tok2vec = student_nlp.get_pipe("tok2vec")
+    student_tok2vec._old_distill = student_tok2vec.distill
+
+    def tok2vec_distill_wrapper(
+        self,
+        teacher_pipe,
+        examples,
+        **kwargs,
+    ):
+        assert all(not eg.reference.tensor.any() for eg in examples)
+        out = self._old_distill(teacher_pipe, examples, **kwargs)
+        assert all(eg.reference.tensor.any() for eg in examples)
+        return out
+
+    student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
+    student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index a1bb1265eae..8b6d2380bae 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
+
+Performs an update of the student pipe's model using the student's distillation 
+examples and sets the annotations of the teacher's distillation examples using 
+the teacher pipe. 
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its 
+representations from the teacher. However, since downstream pipes that do 
+perform distillation expect the tok2vec annotations to be present on the 
+correct distillation examples, we need to ensure that they are set beforehand.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tok2vec")
+> student_pipe = student.add_pipe("tok2vec")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Tok2Vec.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood

From bbeddbb7467341c05eddc5d843ae8b318a01fb5c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 19 Mar 2023 23:41:20 +0100
Subject: [PATCH 062/504] Clean up Vocab constructor (#12290)

* Clean up Vocab constructor

* Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]`
  * Don't automatically add strings to vocab
* Change default values to `None`
* Remove `**deprecated_kwargs`

* Format
---
 spacy/strings.pyi                             |  2 +-
 spacy/tests/pipeline/test_pipe_methods.py     |  3 ++-
 .../serialize/test_serialize_vocab_strings.py | 27 +++++++++++--------
 spacy/tests/vocab_vectors/test_lexeme.py      |  2 +-
 spacy/vocab.pyi                               |  2 +-
 spacy/vocab.pyx                               | 18 +++++++------
 website/docs/api/vocab.mdx                    |  5 ++--
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 8b7c0d6bd5a..393661f591d 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
 class StringStore:
-    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
     @overload
     def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 9b9786f0458..39611a74278 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -9,6 +9,7 @@
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.strings import StringStore
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
@@ -131,7 +132,7 @@ def test_issue5458():
     # Test that the noun chuncker does not generate overlapping spans
     # fmt: off
     words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
-    vocab = Vocab(strings=words)
+    vocab = Vocab(strings=StringStore(words))
     deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
     pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
     heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index fd80c3d8e4f..f6356ac9e01 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -13,8 +13,11 @@
 
 from ..util import make_tempdir
 
-test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
-test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
+test_strings = [
+    (StringStore(), StringStore()),
+    (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])),
+]
+test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")]
 
 
 @pytest.mark.issue(599)
@@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     vocab2 = Vocab(strings=strings2)
     vocab1_b = vocab1.to_bytes()
     vocab2_b = vocab2.to_bytes()
-    if strings1 == strings2:
+    if strings1.to_bytes() == strings2.to_bytes():
         assert vocab1_b == vocab2_b
     else:
         assert vocab1_b != vocab2_b
@@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
 def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
-    vocab1[strings[0]].norm_ = lex_attr
-    assert vocab1[strings[0]].norm_ == lex_attr
-    assert vocab2[strings[0]].norm_ != lex_attr
+    s = next(iter(vocab1.strings))
+    vocab1[s].norm_ = lex_attr
+    assert vocab1[s].norm_ == lex_attr
+    assert vocab2[s].norm_ != lex_attr
     vocab2 = vocab2.from_bytes(vocab1.to_bytes())
-    assert vocab2[strings[0]].norm_ == lex_attr
+    assert vocab2[s].norm_ == lex_attr
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
 def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
-    vocab1[strings[0]].norm_ = lex_attr
-    assert vocab1[strings[0]].norm_ == lex_attr
-    assert vocab2[strings[0]].norm_ != lex_attr
+    s = next(iter(vocab1.strings))
+    vocab1[s].norm_ = lex_attr
+    assert vocab1[s].norm_ == lex_attr
+    assert vocab2[s].norm_ != lex_attr
     with make_tempdir() as d:
         file_path = d / "vocab"
         vocab1.to_disk(file_path)
         vocab2 = vocab2.from_disk(file_path)
-    assert vocab2[strings[0]].norm_ == lex_attr
+    assert vocab2[s].norm_ == lex_attr
 
 
 @pytest.mark.parametrize("strings1,strings2", test_strings)
diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py
index 156e3391aa2..dc2c80bcdd0 100644
--- a/spacy/tests/vocab_vectors/test_lexeme.py
+++ b/spacy/tests/vocab_vectors/test_lexeme.py
@@ -18,7 +18,7 @@ def test_issue361(en_vocab, text1, text2):
 
 @pytest.mark.issue(600)
 def test_issue600():
-    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
+    vocab = Vocab()
     doc = Doc(vocab, words=["hello"])
     doc[0].tag_ = "NN"
 
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 7fbb9764f10..beb7febee63 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -27,7 +27,7 @@ class Vocab:
     def __init__(
         self,
         lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
-        strings: Optional[Union[List[str], StringStore]] = ...,
+        strings: Optional[StringStore] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
         writing_system: Dict[str, Any] = ...,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3145f51844a..3ccfa6db622 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -46,9 +46,8 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
-                 **deprecated_kwargs):
+    def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
+            oov_prob=-20., writing_system=None, get_noun_chunks=None):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -66,16 +65,19 @@ cdef class Vocab:
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
-        self.strings = StringStore()
         self.length = 0
-        if strings:
-            for string in strings:
-                _ = self[string]
+        if strings is None:
+            self.strings = StringStore()
+        else:
+            self.strings = strings
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
         self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
-        self.writing_system = writing_system
+        if writing_system is None:
+            self.writing_system = {}
+        else:
+            self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
 
     property vectors:
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 36369c78427..88d3939142f 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -24,14 +24,15 @@ Create the vocabulary.
 > #### Example
 >
 > ```python
+> from spacy.strings import StringStore
 > from spacy.vocab import Vocab
-> vocab = Vocab(strings=["hello", "world"])
+> vocab = Vocab(strings=StringStore(["hello", "world"]))
 > ```
 
 | Name               | Description                                                                                                                                                             |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~                                      |
-| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           |
+| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~                                                                         |
 | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      |
 | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             |
 | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          |

From e47e01001deddb569a2910131e6b96ed6cbb676d Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 00:34:35 +0100
Subject: [PATCH 063/504] Introduce hierarchy for EL `Candidate` objects
 (#12341)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimick best practices

* fix docs

* fix import

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                            |   8 +-
 spacy/kb/__init__.py                       |  10 +-
 spacy/kb/candidate.pxd                     |  22 ++--
 spacy/kb/candidate.pyx                     | 135 +++++++++++----------
 spacy/kb/kb.pyx                            |  23 ++--
 spacy/kb/kb_in_memory.pyx                  |  36 +++---
 spacy/ml/models/entity_linker.py           |  28 +++++
 spacy/pipeline/entity_linker.py            |  61 ++++++++--
 spacy/tests/pipeline/test_entity_linker.py |  48 ++++----
 spacy/tests/serialize/test_serialize_kb.py |  12 +-
 website/docs/api/inmemorylookupkb.mdx      |  40 ++----
 website/docs/api/kb.mdx                    |  51 +++-----
 12 files changed, 263 insertions(+), 211 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 83a1e9ba2c0..42fdc12e029 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "ignoring the duplicate entry.")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses. If "
             "you are constructing a parse tree incrementally by setting "
@@ -214,7 +214,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -981,6 +985,8 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 93a65ab6194..fb21083ddee 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,11 +1,7 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+from .candidate import Candidate, InMemoryCandidate
 
-__all__ = [
-    "Candidate",
-    "KnowledgeBase",
-    "InMemoryLookupKB",
-    "get_candidates",
-    "get_candidates_batch",
-]
+
+__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 80fcbc45940..f21f423e496 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,15 +1,15 @@
 from libcpp.vector cimport vector
-
+from .kb_in_memory cimport InMemoryLookupKB
 from ..typedefs cimport hash_t
-from .kb cimport KnowledgeBase
-
 
-# Object used by the Entity Linker that summarizes one entity-alias candidate
-# combination.
 cdef class Candidate:
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
-    cdef float entity_freq
-    cdef vector[float] entity_vector
-    cdef hash_t alias_hash
-    cdef float prior_prob
+    pass
+
+
+cdef class InMemoryCandidate(Candidate):
+    cdef readonly hash_t _entity_hash
+    cdef readonly hash_t _alias_hash
+    cpdef vector[float] _entity_vector
+    cdef float _prior_prob
+    cdef readonly InMemoryLookupKB _kb
+    cdef float _entity_freq
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index 4369676e23a..bf66ccfae67 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,90 +1,97 @@
 # cython: infer_types=True
 
-from typing import Iterable
-
-from .kb cimport KnowledgeBase
-
-from ..tokens import Span
+from .kb_in_memory cimport InMemoryLookupKB
+from ..errors import Errors
 
 
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or
-    may not be resolved to a specific `entity` from a Knowledge Base. This
-    will be used as input for the entity linking algorithm which will
-    disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned a certain prior probability.
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
+    algorithm which will disambiguate the various candidates to the correct one.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.
 
     DOCS: https://spacy.io/api/kb/#candidate-init
     """
 
-    def __init__(
-        self,
-        KnowledgeBase kb,
-        entity_hash,
-        entity_freq,
-        entity_vector,
-        alias_hash,
-        prior_prob
-    ):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
 
     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError
 
     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError
 
     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
 
-    @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB]): InMemoryLookupKB instance.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq
 
     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
+    def entity_id(self) -> int:
+        return self._entity_hash
 
     @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector
 
     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
-
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob
 
-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate
-    entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]
 
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]
 
-def get_candidates_batch(
-        kb: KnowledgeBase, mentions: Iterable[Span]
-) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries
-    from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
-    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
-    """
-    return kb.get_candidates_batch(mentions)
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index c7db34e166a..24cee30ffc7 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -36,10 +36,10 @@ cdef class KnowledgeBase:
         self, mentions: Iterable[Span]
     ) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines
-        the entity, the original alias, and the prior probability of that
-        alias resolving to that entity.
-        If no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidates are found for a given mention, an empty list is returned.
         mentions (Iterable[Span]): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
@@ -47,10 +47,10 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines
-        the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specific mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidate is found for the given mention, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
@@ -128,3 +128,10 @@ cdef class KnowledgeBase:
                 parent="KnowledgeBase", method="from_disk", name=self.__name__
             )
         )
+
+    @property
+    def supports_prior_probs(self) -> bool:
+        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
+        raise NotImplementedError(
+            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
+        )
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 2b21f246a54..3aab0d73e72 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -22,8 +22,7 @@ from ..util import SimpleFrozenList, ensure_path
 
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
-
-from .candidate import Candidate as Candidate
+from .candidate import InMemoryCandidate
 
 
 cdef class InMemoryLookupKB(KnowledgeBase):
@@ -255,10 +254,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             alias_entry.probs = probs
             self._aliases_table[alias_index] = alias_entry
 
-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        return self.get_alias_candidates(mention.text)  # type: ignore
+    def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
+        return self._get_alias_candidates(mention.text)  # type: ignore
 
-    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
         Return candidate entities for an alias. Each candidate defines the
         entity, the original alias, and the prior probability of that alias
@@ -271,18 +270,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
-                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[
-                              self._entries[entry_index].vector_index
-                          ],
-                          alias_hash=alias_hash,
-                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(
-                    alias_entry.entry_indices, alias_entry.probs
-                )
-                if entry_index != 0]
+        return [
+            InMemoryCandidate(
+                kb=self,
+                entity_hash=self._entries[entry_index].entity_hash,
+                alias_hash=alias_hash,
+                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                prior_prob=prior_prob,
+                entity_freq=self._entries[entry_index].freq
+            )
+            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+            if entry_index != 0
+        ]
 
     def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -316,6 +315,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
         return 0.0
 
+    def supports_prior_probs(self) -> bool:
+        return True
+
     def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.
         """
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index b7100c00a4b..99522c4617c 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -14,6 +14,12 @@
 )
 from thinc.types import Floats2d
 
+from ...util import registry
+from ...kb import KnowledgeBase, InMemoryLookupKB
+from ...kb import Candidate
+from ...vocab import Vocab
+from ...tokens import Span, Doc
+from ..extract_spans import extract_spans
 from ...errors import Errors
 from ...kb import (
     Candidate,
@@ -132,3 +138,25 @@ def create_candidates_batch() -> Callable[
     [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+    """
+    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mention (Span): Entity mention for which to identify candidates.
+    RETURNS (Iterable[Candidate]): Identified candidates.
+    """
+    return kb.get_candidates(mention)
+
+
+def get_candidates_batch(
+    kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
+    """
+    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+    """
+    return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 546bd9f6e2a..410da58a46d 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,5 +1,5 @@
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
-from typing import cast
+import warnings
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
 from numpy import dtype
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from pathlib import Path
@@ -15,14 +15,13 @@
 from thinc.types import Floats2d
 
 from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
 from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
@@ -252,6 +251,8 @@ def __init__(
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
+        if self.incl_prior and not self.kb.supports_prior_probs:
+            warnings.warn(Warnings.W401)
 
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
@@ -524,17 +525,51 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                             entity_encodings = xp.asarray(
                                 [c.entity_vector for c in candidates]
                             )
-                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                            if len(entity_encodings) != len(prior_probs):
-                                raise RuntimeError(
-                                    Errors.E147.format(
-                                        method="predict",
-                                        msg="vectors not of equal length",
+                        elif len(candidates) == 1 and self.threshold is None:
+                            # shortcut for efficiency reasons: take the 1 candidate
+                            final_kb_ids.append(candidates[0].entity_id_)
+                            self._add_activations(
+                                doc_scores=doc_scores,
+                                doc_ents=doc_ents,
+                                scores=[1.0],
+                                ents=[candidates[0].entity_id],
+                            )
+                        else:
+                            random.shuffle(candidates)
+                            # set all prior probabilities to 0 if incl_prior=False
+                            if self.incl_prior and self.kb.supports_prior_probs:
+                                prior_probs = xp.asarray([c.prior_prob for c in candidates])  # type: ignore
+                            else:
+                                prior_probs = xp.asarray([0.0 for _ in candidates])
+                            scores = prior_probs
+                            # add in similarity from the context
+                            if self.incl_context:
+                                entity_encodings = xp.asarray(
+                                    [c.entity_vector for c in candidates]
+                                )
+                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                                if len(entity_encodings) != len(prior_probs):
+                                    raise RuntimeError(
+                                        Errors.E147.format(
+                                            method="predict",
+                                            msg="vectors not of equal length",
+                                        )
                                     )
                                 )
-                            # cosine similarity
-                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                sentence_norm * entity_norm
+                                if sims.shape != prior_probs.shape:
+                                    raise ValueError(Errors.E161)
+                                scores = prior_probs + sims - (prior_probs * sims)
+                            final_kb_ids.append(
+                                candidates[scores.argmax().item()].entity_id_
+                                if self.threshold is None
+                                or scores.max() >= self.threshold
+                                else EntityLinker.NIL
+                            )
+                            self._add_activations(
+                                doc_scores=doc_scores,
+                                doc_ents=doc_ents,
+                                scores=scores,
+                                ents=[c.entity_id for c in candidates],
                             )
                             if sims.shape != prior_probs.shape:
                                 raise ValueError(Errors.E161)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index f28a4c9d5b9..4ba7cc1a16d 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -7,9 +7,10 @@
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
-from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
+from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker, get_candidates
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
@@ -452,16 +453,17 @@ def test_candidate_generation(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
+    adam_ent_cands = get_candidates(mykb, adam_ent)
     assert len(get_candidates(mykb, douglas_ent)) == 2
-    assert len(get_candidates(mykb, adam_ent)) == 1
+    assert len(adam_ent_cands) == 1
     assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive
     assert len(get_candidates(mykb, shrubbery_ent)) == 0
 
     # test the content of the candidates
-    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
-    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
-    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
-    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
+    assert adam_ent_cands[0].entity_id_ == "Q2"
+    assert adam_ent_cands[0].alias == "adam"
+    assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
+    assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
 
 
 def test_el_pipe_configuration(nlp):
@@ -489,7 +491,7 @@ def create_kb(vocab):
     assert doc[2].ent_kb_id_ == "Q2"
 
     def get_lowercased_candidates(kb, span):
-        return kb.get_alias_candidates(span.text.lower())
+        return kb._get_alias_candidates(span.text.lower())
 
     def get_lowercased_candidates_batch(kb, spans):
         return [get_lowercased_candidates(kb, span) for span in spans]
@@ -548,24 +550,22 @@ def test_vocab_serialization(nlp):
     mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
-    candidates = mykb.get_alias_candidates("adam")
+    candidates = mykb._get_alias_candidates("adam")
     assert len(candidates) == 1
-    assert candidates[0].entity == q2_hash
-    assert candidates[0].entity_ == "Q2"
-    assert candidates[0].alias == adam_hash
-    assert candidates[0].alias_ == "adam"
+    assert candidates[0].entity_id == q2_hash
+    assert candidates[0].entity_id_ == "Q2"
+    assert candidates[0].alias == "adam"
 
     with make_tempdir() as d:
         mykb.to_disk(d / "kb")
         kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         kb_new_vocab.from_disk(d / "kb")
 
-        candidates = kb_new_vocab.get_alias_candidates("adam")
+        candidates = kb_new_vocab._get_alias_candidates("adam")
         assert len(candidates) == 1
-        assert candidates[0].entity == q2_hash
-        assert candidates[0].entity_ == "Q2"
-        assert candidates[0].alias == adam_hash
-        assert candidates[0].alias_ == "adam"
+        assert candidates[0].entity_id == q2_hash
+        assert candidates[0].entity_id_ == "Q2"
+        assert candidates[0].alias == "adam"
 
         assert kb_new_vocab.get_vector("Q2") == [2]
         assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@@ -585,20 +585,20 @@ def test_append_alias(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
-    assert len(mykb.get_alias_candidates("douglas")) == 2
+    assert len(mykb._get_alias_candidates("douglas")) == 2
 
     # append an alias
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
     # test the size of the relevant candidates has been incremented
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3
 
     # append the same alias-entity pair again should not work (will throw a warning)
     with pytest.warns(UserWarning):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
 
     # test the size of the relevant candidates remained unchanged
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3
 
 
 @pytest.mark.filterwarnings("ignore:\\[W036")
@@ -895,11 +895,11 @@ def test_kb_to_bytes():
     assert kb_2.contains_alias("Russ Cochran")
     assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
     assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
-        kb_2.get_alias_candidates("Russ Cochran")
+    assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
+        kb_2._get_alias_candidates("Russ Cochran")
     )
-    assert len(kb_1.get_alias_candidates("Randomness")) == len(
-        kb_2.get_alias_candidates("Randomness")
+    assert len(kb_1._get_alias_candidates("Randomness")) == len(
+        kb_2._get_alias_candidates("Randomness")
     )
 
 
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 99eb8cd8694..b6bad3c46ee 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -66,19 +66,21 @@ def _check_kb(kb):
         assert alias_string not in kb.get_alias_strings()
 
     # check candidates & probabilities
-    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
+    candidates = sorted(
+        kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
+    )
     assert len(candidates) == 2
 
-    assert candidates[0].entity_ == "Q007"
+    assert candidates[0].entity_id_ == "Q007"
     assert 6.999 < candidates[0].entity_freq < 7.01
     assert candidates[0].entity_vector == [0, 0, 7]
-    assert candidates[0].alias_ == "double07"
+    assert candidates[0].alias == "double07"
     assert 0.899 < candidates[0].prior_prob < 0.901
 
-    assert candidates[1].entity_ == "Q17"
+    assert candidates[1].entity_id_ == "Q17"
     assert 1.99 < candidates[1].entity_freq < 2.01
     assert candidates[1].entity_vector == [7, 1, 0]
-    assert candidates[1].alias_ == "double07"
+    assert candidates[1].alias == "double07"
     assert 0.099 < candidates[1].prior_prob < 0.101
 
 
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 15b1d3bf29c..4621d883810 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -10,9 +10,9 @@ version: 3.5
 
 The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
 implements all of its methods. It stores all KB data in-memory and generates
-[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
-entity names. It's highly optimized for both a low memory footprint and speed of
-retrieval.
+[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
+with entity names. It's highly optimized for both a low memory footprint and
+speed of retrieval.
 
 ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
 
@@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
 ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
 
 Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate). Wraps
+of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
 [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 
 > #### Example
@@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
 > candidates = kb.get_candidates(doc[0:2])
 > ```
 
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `mention`   | The textual mention or alias. ~~Span~~                               |
-| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
+| Name        | Description                                                                          |
+| ----------- | ------------------------------------------------------------------------------------ |
+| `mention`   | The textual mention or alias. ~~Span~~                                               |
+| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
 
 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
 
@@ -194,26 +194,10 @@ to you.
 > candidates = kb.get_candidates((doc[0:2], doc[3:]))
 > ```
 
-| Name        | Description                                                                                  |
-| ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
-| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
-
-## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate).
-
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name        | Description                                                   |
-| ----------- | ------------------------------------------------------------- |
-| `alias`     | The textual mention or alias. ~~str~~                         |
-| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+| Name        | Description                                                                                                  |
+| ----------- | ------------------------------------------------------------------------------------------------------------ |
+| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
 
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 2b0d4d9d6b3..9536a3fe375 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -103,23 +103,6 @@ to you.
 | `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
-## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-<Infobox variant="warning">
-  This method is _not_ available from spaCy 3.5 onwards.
-</Infobox>
-
-From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
-allow more flexibility in customizing knowledge bases. Some of its methods were
-moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
-one of those being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-Note:
-[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
-defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
 
 Given a certain entity ID, retrieve its pretrained entity vector.
@@ -190,25 +173,27 @@ Restore the state of the knowledge base from a given directory. Note that the
 | `exclude`   | List of components to exclude. ~~Iterable[str]~~                                                |
 | **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~                                          |
 
-## Candidate {id="candidate",tag="class"}
+## InMemoryCandidate {id="candidate",tag="class"}
 
-A `Candidate` object refers to a textual mention (alias) that may or may not be
-resolved to a specific entity from a `KnowledgeBase`. This will be used as input
-for the entity linking algorithm which will disambiguate the various candidates
-to the correct one. Each candidate `(alias, entity)` pair is assigned to a
-certain prior probability.
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or
+may not be resolved to a specific entity from a `KnowledgeBase`. This will be
+used as input for the entity linking algorithm which will disambiguate the
+various candidates to the correct one. Each candidate `(alias, entity)` pair is
+assigned to a certain prior probability.
 
-### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
+### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
 
-Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the `get_candidates` method of the
-[`entity_linker`](/api/entitylinker) pipe.
+Construct an `InMemoryCandidate` object. Usually this constructor is not called
+directly, but instead these objects are returned by the `get_candidates` method
+of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example
+> #### Example```python
+>
+> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
+> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+>
+> ```
 >
-> ```python
-> from spacy.kb import Candidate
-> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
 > ```
 
 | Name          | Description                                                               |
@@ -216,10 +201,10 @@ but instead these objects are returned by the `get_candidates` method of the
 | `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~         |
 | `entity_hash` | The hash of the entity's KB ID. ~~int~~                                   |
 | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~                     |
-| `alias_hash`  | The hash of the textual mention or alias. ~~int~~                         |
+| `alias_hash`  | The hash of the entity alias. ~~int~~                                     |
 | `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
 
-## Candidate attributes {id="candidate-attributes"}
+## InMemoryCandidate attributes {id="candidate-attributes"}
 
 | Name            | Description                                                              |
 | --------------- | ------------------------------------------------------------------------ |

From f0fc529bf4ce45f08ff4492f081dc58766dba36e Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 064/504] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/kb/__init__.py                       |  1 -
 spacy/kb/kb.pyx                            | 10 ++---
 spacy/ml/models/entity_linker.py           |  8 ++--
 spacy/pipeline/entity_linker.py            | 45 ++++++++++------------
 spacy/tests/pipeline/test_entity_linker.py |  1 -
 website/docs/api/inmemorylookupkb.mdx      |  5 ++-
 website/docs/api/kb.mdx                    | 11 +++---
 7 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index fb21083ddee..7155c15df9a 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -3,5 +3,4 @@
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate
 
-
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index 24cee30ffc7..bb58bf88a46 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,8 +5,8 @@ from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
 
-from ..errors import Errors
-from ..tokens import Span
+from .candidate import Candidate
+from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from .candidate import Candidate
 
@@ -32,15 +32,13 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()
 
-    def get_candidates_batch(
-        self, mentions: Iterable[Span]
-    ) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
         """
         Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
         probability of the specified mention text resolving to that entity - might be included.
         If no candidates are found for a given mention, an empty list is returned.
-        mentions (Iterable[Span]): Mentions for which to get candidates.
+        mentions (SpanGroup): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         return [self.get_candidates(span) for span in mentions]
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 99522c4617c..db960fbd0a9 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -18,7 +18,7 @@
 from ...kb import KnowledgeBase, InMemoryLookupKB
 from ...kb import Candidate
 from ...vocab import Vocab
-from ...tokens import Span, Doc
+from ...tokens import Doc, Span, SpanGroup
 from ..extract_spans import extract_spans
 from ...errors import Errors
 from ...kb import (
@@ -135,7 +135,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
 
 @registry.misc("spacy.CandidateBatchGenerator.v1")
 def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+    [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
 
@@ -151,12 +151,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
 
 
 def get_candidates_batch(
-    kb: KnowledgeBase, mentions: Iterable[Span]
+    kb: KnowledgeBase, mentions: SpanGroup
 ) -> Iterable[Iterable[Candidate]]:
     """
     Return candidate entities for the given mentions and fetching appropriate entries from the index.
     kb (KnowledgeBase): Knowledge base to query.
-    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    mentions (SpanGroup): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
     """
     return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 410da58a46d..4882ead1d92 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,6 +16,8 @@
 
 from ..kb import KnowledgeBase, Candidate
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -97,7 +99,7 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
     ],
     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
@@ -120,7 +122,7 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
         produces a list of candidates, given a certain knowledge base and a textual mention.
     get_candidates_batch (
-        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
@@ -185,7 +187,7 @@ def __init__(
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
         ],
         overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
@@ -208,7 +210,7 @@ def __init__(
         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
         get_candidates_batch (
-            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
         overwrite (bool): Whether to overwrite existing non-empty annotations.
@@ -472,26 +474,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 continue
             sentences = [s for s in doc.sents]
 
-                if self.incl_context:
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    sentence_encoding = self.model.predict([sent_doc])[0]
-                    sentence_encoding_t = sentence_encoding.T
-                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                entity_count += 1
-                if ent.label_ in self.labels_discard:
-                    # ignoring this entity - setting to NIL
-                    final_kb_ids.append(self.NIL)
-                    self._add_activations(
-                        doc_scores=doc_scores,
-                        doc_ents=doc_ents,
-                        scores=[0.0],
-                        ents=[0],
+            # Loop over entities in batches.
+            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
+                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
+
+                # Look up candidate entities.
+                valid_ent_idx = [
+                    idx
+                    for idx in range(len(ent_batch))
+                    if ent_batch[idx].label_ not in self.labels_discard
+                ]
+
+                batch_candidates = list(
+                    self.get_candidates_batch(
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                 else:
                     candidates = list(self.get_candidates(self.kb, ent))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 4ba7cc1a16d..170f2215f83 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -985,7 +985,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 4621d883810..712cce30747 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -189,14 +189,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                                  |
 | ----------- | ------------------------------------------------------------------------------------------------------------ |
-| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                                          |
 | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 9536a3fe375..94506162f27 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -93,14 +93,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                  |
 | ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                          |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
 directly, but instead these objects are returned by the `get_candidates` method
 of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example```python
+> #### Example
 >
+> ```python
 > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
 > entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
->
-> ```
->
 > ```
 
 | Name          | Description                                                               |

From c778a73d04e1cec3811b771198307948227a0c5f Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 27 Mar 2023 09:18:23 +0200
Subject: [PATCH 065/504] Add info that Vocab and StringStore are not static in
 docs (#12427)

* Add size increase info about vocab and stringstore

* Update website/docs/api/stringstore.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Update website/docs/api/vocab.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Change wording

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 website/docs/api/stringstore.mdx | 2 +-
 website/docs/api/vocab.mdx       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 269ac2d0c4b..1b1f3bd5352 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -8,7 +8,7 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.
 
-<Infobox variant ="warning">
+<Infobox variant="warning">
 
 Note that a `StringStore` instance is not static. It increases in size as texts
 with new tokens are processed.
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 88d3939142f..319ce88b8dc 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -10,7 +10,7 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.
 
-<Infobox variant ="warning">
+<Infobox variant="warning">
 
 Note that a `Vocab` instance is not static. It increases in size as texts with
 new tokens are processed. Some models may have an empty vocab at initialization.

From 4235c55db63ddaaaabc2d7cc56e107ed44e52316 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 30 Mar 2023 09:30:42 +0200
Subject: [PATCH 066/504] Avoid `TrainablePipe.finish_update` getting called
 twice during training (#12450)

* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradient were accumulated. However, it introduced a new bug that cause
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed.

This change restores proper handling/use of `False`. Moreover, the role
of `False` is now documented and added to the type to avoid future
accidents.

* Fix typo

* Document defaults for `Language.update`
---
 spacy/language.py             |  7 +++++--
 spacy/tests/test_language.py  | 18 ++++++++++++++++++
 spacy/training/loop.py        |  2 +-
 website/docs/api/language.mdx | 18 +++++++++---------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e8a7d719ef2..b8c4322d3b4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1230,7 +1230,7 @@ def update(
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1241,7 +1241,9 @@ def update(
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1300,6 +1302,7 @@ def update(
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 88ef3d434c0..e4b06893c93 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -162,6 +162,24 @@ def test_language_update_updates():
     )
 
 
+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e6b3451cd73..9497b95aba5 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -221,7 +221,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 2a1f7a1a961..e38e49bf569 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 >     nlp.update([example], sgd=optimizer)
 > ```
 
-| Name            | Description                                                                                                                                    |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
-| _keyword-only_  |                                                                                                                                                |
-| `drop`          | The dropout rate. ~~float~~                                                                                                                    |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                  |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
+| Name            | Description                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                                                                                  |
+| _keyword-only_  |                                                                                                                                                                                                    |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~                                                                                                                                                     |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~                                                                                |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.distill {id="distill",tag="method,experimental",version="4"}
 

From 1431ee4af60d23be6f5d6c9a4baeebbb15bfc615 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 Apr 2023 16:01:59 +0200
Subject: [PATCH 067/504] Enforce that Span.start/end(_char) remain valid and
 in sync (#12268)

* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
---
 spacy/errors.py              |  5 +++-
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx        | 49 +++++++++++++++++++++++++++---------
 3 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 42fdc12e029..fe067f7915d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -932,7 +932,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -986,6 +986,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 74874624888..0b05ca7c123 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -696,3 +696,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 1378889c681..f51a1c5ee3e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -800,36 +800,61 @@ cdef class Span:
             return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0:
-                raise IndexError("TODO")
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx
 
     property end:
         def __get__(self):
             return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0:
-                raise IndexError("TODO")
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError("TODO")
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError("TODO")
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end
 
     property label:
         def __get__(self):

From 030dda16162b25e92b2ea9fb7c4e100003fc6b43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 21 Apr 2023 13:49:40 +0200
Subject: [PATCH 068/504] Add distillation loop (#12542)

* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We apply finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                 |  26 ++-
 spacy/schemas.py                  |   2 +-
 spacy/tests/training/test_loop.py | 111 +++++++++++
 spacy/training/initialize.py      | 134 ++++++++++---
 spacy/training/loop.py            | 317 +++++++++++++++++++++++++++---
 spacy/util.py                     |  20 ++
 website/docs/api/language.mdx     |  26 +--
 7 files changed, 560 insertions(+), 76 deletions(-)
 create mode 100644 spacy/tests/training/test_loop.py

diff --git a/spacy/language.py b/spacy/language.py
index b8c4322d3b4..028f733200e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1052,7 +1052,7 @@ def distill(
         examples: Iterable[Example],
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1065,7 +1065,9 @@ def distill(
             (teacher) and predicted (student) docs must have the same number of
             tokens and the same orthography.
         drop (float): The dropout rate.
-        sgd (Optional[Optimizer]): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Optional(Dict[str, float])): Dictionary to update with the loss,
             keyed by component.
         component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
@@ -1135,11 +1137,23 @@ def distill(
                 student_proc.distill(
                     teacher_pipe,
                     examples,
-                    sgd=sgd,
+                    sgd=None,
                     losses=losses,
                     **component_cfg[student_name],
                 )
 
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for student_name, student_proc in self.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and sgd not in (None, False)
+            ):
+                student_proc.finish_update(sgd)
+
         return losses
 
     def disable_pipes(self, *names) -> "DisabledPipes":
@@ -1908,7 +1922,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
-        orig_distill = config.pop("distill", None)
+        orig_distill = config.pop("distillation", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1918,8 +1932,8 @@ def from_config(
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         if orig_distill is not None:
-            filled["distill"] = orig_distill
-            config["distill"] = orig_distill
+            filled["distillation"] = orig_distill
+            config["distillation"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 32fb042b5a0..7fc5ec20e51 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -510,7 +510,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
-    "distill": ConfigSchemaDistill,
+    "distillation": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/training/test_loop.py b/spacy/tests/training/test_loop.py
new file mode 100644
index 00000000000..46d01509504
--- /dev/null
+++ b/spacy/tests/training/test_loop.py
@@ -0,0 +1,111 @@
+from typing import Callable, Iterable, Iterator
+import pytest
+from spacy import Language
+from spacy.training import Example
+from spacy.training.initialize import init_nlp_student
+from spacy.training.loop import distill, train
+from spacy.util import load_model_from_config, registry
+from thinc.api import Config
+
+
+@pytest.fixture
+def config_str():
+    return """
+    [nlp]
+    lang = "en"
+    pipeline = ["senter"]
+    disabled = []
+    before_creation = null
+    after_creation = null
+    after_pipeline_creation = null
+    batch_size = 1000
+    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+    [components]
+
+    [components.senter]
+    factory = "senter"
+
+    [training]
+    dev_corpus = "corpora.dev"
+    train_corpus = "corpora.train"
+    max_steps = 50
+    seed = 1
+    gpu_allocator = null
+
+    [distillation]
+    corpus = "corpora.train"
+    dropout = 0.1
+    max_epochs = 0
+    max_steps = 50
+    student_to_teacher = {}
+
+    [distillation.batcher]
+    @batchers = "spacy.batch_by_words.v1"
+    size = 3000
+    discard_oversize = false
+    tolerance = 0.2
+
+    [distillation.optimizer]
+    @optimizers = "Adam.v1"
+    beta1 = 0.9
+    beta2 = 0.999
+    L2_is_weight_decay = true
+    L2 = 0.01
+    grad_clip = 1.0
+    use_averages = true
+    eps = 1e-8
+    learn_rate = 1e-4
+
+    [corpora]
+
+    [corpora.dev]
+    @readers = "sentence_corpus"
+
+    [corpora.train]
+    @readers = "sentence_corpus"
+    """
+
+
+SENT_STARTS = [0] * 14
+SENT_STARTS[0] = 1
+SENT_STARTS[5] = 1
+SENT_STARTS[9] = 1
+
+TRAIN_DATA = [
+    (
+        "I like green eggs. Eat blue ham. I like purple eggs.",
+        {"sent_starts": SENT_STARTS},
+    ),
+    (
+        "She likes purple eggs. They hate ham. You like yellow eggs.",
+        {"sent_starts": SENT_STARTS},
+    ),
+]
+
+
+@pytest.mark.slow
+def test_distill_loop(config_str):
+    @registry.readers("sentence_corpus")
+    def create_sentence_corpus() -> Callable[[Language], Iterable[Example]]:
+        return SentenceCorpus()
+
+    class SentenceCorpus:
+        def __call__(self, nlp: Language) -> Iterator[Example]:
+            for t in TRAIN_DATA:
+                yield Example.from_dict(nlp.make_doc(t[0]), t[1])
+
+    orig_config = Config().from_str(config_str)
+    teacher = load_model_from_config(orig_config, auto_fill=True, validate=True)
+    teacher.initialize()
+    train(teacher)
+
+    orig_config = Config().from_str(config_str)
+    student = init_nlp_student(orig_config, teacher)
+    student.initialize()
+    distill(teacher, student)
+
+    doc = student(TRAIN_DATA[0][0])
+    assert doc.sents[0].text == "I like green eggs."
+    assert doc.sents[1].text == "Eat blue ham."
+    assert doc.sents[2].text == "I like purple eggs."
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 191821e786e..61ad1c09cc0 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,3 +1,9 @@
+from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
+from thinc.api import Config, ConfigValidationError
+from pathlib import Path
+import srsly
+import numpy
+import tarfile
 import gzip
 import tarfile
 import warnings
@@ -12,22 +18,11 @@
 from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
 
 from ..errors import Errors, Warnings
-from ..lookups import Lookups
-from ..schemas import ConfigSchemaTraining
-from ..util import (
-    DEFAULT_OOV_PROB,
-    OOV_RANK,
-    ensure_path,
-    get_sourced_components,
-    load_model,
-    load_model_from_config,
-    logger,
-    registry,
-    resolve_dot_names,
-)
-from ..vectors import Mode as VectorsMode
-from ..vectors import Vectors
-from .pretrain import get_tok2vec_ref
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
+from ..util import load_model, ensure_path, get_sourced_components
+from ..util import OOV_RANK, DEFAULT_OOV_PROB
+from ..util import set_gpu_allocator_from_config, set_seed_from_config
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -36,15 +31,8 @@
 def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
-    if "seed" not in config["training"]:
-        raise ValueError(Errors.E1015.format(value="[training] seed"))
-    if "gpu_allocator" not in config["training"]:
-        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
     # Use original config here before it's resolved to functions
     sourced = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
@@ -111,6 +99,102 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     return nlp
 
 
+def init_nlp_student(
+    config: Config, teacher: "Language", *, use_gpu: int = -1
+) -> "Language":
+    """Initialize student pipeline for distillation.
+
+    config (Config): Student model configuration.
+    teacher (Language): The teacher pipeline to distill from.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    """
+    raw_config = config
+    config = raw_config.interpolate()
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
+
+    # Use original config here before it's resolved to functions
+    sourced = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    logger.info("Set up nlp object from config")
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    D = registry.resolve(config["distillation"], schema=ConfigSchemaDistill)
+    dot_names = [T["dev_corpus"]]
+    if not isinstance(D["corpus"], str):
+        raise ConfigValidationError(
+            desc=Errors.E897.format(field="distillation.corpus", type=type(D["corpus"]))
+        )
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(
+            desc=Errors.E897.format(
+                field="training.dev_corpus", type=type(T["dev_corpus"])
+            )
+        )
+    (dev_corpus,) = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    # Make sure that listeners are defined before initializing further
+    nlp._link_components()
+
+    # Get teacher labels to initialize student with.
+    student_to_teacher = D["student_to_teacher"]
+    teacher_pipes = dict(teacher.pipeline)
+    labels = {}
+    for name, pipe in nlp.pipeline:
+        # Copy teacher labels.
+        teacher_pipe_name = (
+            student_to_teacher[name] if name in student_to_teacher else name
+        )
+        teacher_pipe = teacher_pipes.get(teacher_pipe_name, None)
+        if (
+            teacher_pipe is not None
+            and getattr(teacher_pipe, "label_data", None) is not None
+        ):
+            labels[name] = teacher_pipe.label_data  # type: ignore[attr-defined]
+
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        # Initialize on the dev corpus, since the distillation corpus does
+        # usually not have labels. Since we copy the labels from the teacher
+        # pipe, the dev data does not have to be exhaustive.
+        if T["max_epochs"] == -1:
+            sample_size = 100
+            logger.debug(
+                f"Due to streamed train corpus, using only first {sample_size} "
+                f"examples for initialization. If necessary, provide all labels "
+                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+            )
+            nlp.initialize(lambda: islice(dev_corpus(nlp), sample_size), sgd=optimizer)
+        else:
+            nlp.initialize(lambda: dev_corpus(nlp), sgd=optimizer, labels=labels)
+        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    # Detect components with listeners that are not frozen consistently
+    for name, proc in nlp.pipeline:
+        for listener in getattr(
+            proc, "listening_components", []
+        ):  # e.g. tok2vec/transformer
+            # Don't warn about components not in the pipeline
+            if listener not in nlp.pipe_names:
+                continue
+            if listener in frozen_components and name not in frozen_components:
+                logger.warning(Warnings.W087.format(name=name, listener=listener))
+            # We always check this regardless, in case user freezes tok2vec
+            if listener not in frozen_components and name in frozen_components:
+                if name not in T["annotating_components"]:
+                    logger.warning(Warnings.W086.format(name=name, listener=listener))
+    return nlp
+
+
 def init_vocab(
     nlp: "Language",
     *,
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 9497b95aba5..ad162678fec 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,26 +3,20 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
-
-from thinc.api import Config, Optimizer, constant, fix_random_seed, set_gpu_allocator
+from thinc.api import Optimizer, Config, constant
 from wasabi import Printer
+import random
+import sys
+import shutil
+
 
-from ..errors import Errors
-from ..schemas import ConfigSchemaTraining
-from ..util import logger, registry, resolve_dot_names
 from .example import Example
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from ..errors import Errors
+from ..tokens.doc import Doc
+from .. import ty
+from ..util import resolve_dot_names, registry, logger
+from ..util import set_gpu_allocator_from_config, set_seed_from_config
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -32,6 +26,129 @@
 DIR_MODEL_LAST = "model-last"
 
 
+def distill(
+    teacher: "Language",
+    student: "Language",
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    stdout: IO = sys.stdout,
+    stderr: IO = sys.stderr,
+) -> Tuple["Language", Optional[Path]]:
+    """Distill a student pipeline from a teacher pipeline.
+
+    teacher (Language): The teacher pipeline to distill from.
+    student (Language): The student pipeline to distill into.
+    output_path (Optional[Path]): Optional output path to save the student
+        model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    stdout (file): A file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+    stderr (file): A second file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+
+    RETURNS (tuple): The final student nlp object and the path to the exported
+        student model.
+    """
+    # We use no_print here so we can respect the stdout/stderr options.
+    msg = Printer(no_print=True)
+    # Create iterator, which yields out info after each optimization step.
+    config = student.config.interpolate()
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    D = registry.resolve(config["distillation"], schema=ConfigSchemaDistill)
+    dot_names = [D["corpus"], T["dev_corpus"]]
+    distill_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = D["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = D["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    before_update = T["before_update"]
+    student_to_teacher = D["student_to_teacher"]
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with student.use_params(optimizer.averages):
+            before_to_disk(student).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Components that should set annotations on update
+    annotating_components = T["annotating_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = _distill_loop(
+        teacher,
+        student,
+        optimizer,
+        create_distill_batches(student, distill_corpus, batcher, D["max_epochs"]),
+        create_evaluation_callback(student, dev_corpus, score_weights),
+        dropout=D["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        max_steps=D["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+        annotating_components=annotating_components,
+        before_update=before_update,
+        student_to_teacher=student_to_teacher,
+    )
+    clean_output_dir(output_path)
+    stdout.write(msg.info(f"Teacher pipeline: {teacher.pipe_names}") + "\n")
+    stdout.write(msg.info(f"Student pipeline: {student.pipe_names}") + "\n")
+    if frozen_components:
+        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    if annotating_components:
+        stdout.write(
+            msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
+        )
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
+    with student.select_pipes(disable=frozen_components):
+        log_step, finalize_logger = train_logger(student, stdout, stderr)
+    try:
+        for batch, info, is_best_checkpoint in training_step_iterator:
+            if is_best_checkpoint is not None:
+                with student.select_pipes(disable=frozen_components):
+                    update_meta(T, student, info)
+                if output_path is not None:
+                    save_checkpoint(is_best_checkpoint)
+                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
+            log_step(info if is_best_checkpoint is not None else None)
+    except Exception as e:
+        if output_path is not None:
+            stdout.write(
+                msg.warn(
+                    f"Aborting and saving the final best model. "
+                    f"Encountered exception: {repr(e)}"
+                )
+                + "\n"
+            )
+        raise e
+    finally:
+        finalize_logger()
+        if output_path is not None:
+            save_checkpoint(False)
+    # This will only run if we did't hit an error
+    if optimizer.averages:
+        student.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (student, output_path / DIR_MODEL_LAST)
+    else:
+        return (student, None)
+
+
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -57,11 +174,8 @@ def train(
     msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
@@ -150,11 +264,131 @@ def save_checkpoint(is_best):
         return (nlp, None)
 
 
+def _distill_loop(
+    teacher: "Language",
+    student: "Language",
+    optimizer: Optimizer,
+    distill_data: Iterable[List[Example]],
+    evaluate: Callable[[], Tuple[float, Dict[str, float]]],
+    *,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    max_steps: int,
+    exclude: List[str],
+    annotating_components: List[str],
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
+    student_to_teacher: Dict[str, str],
+):
+    """Distill until the data is exhausted or the maximum number of steps
+    has been reached. Works as a generator, with each iteration yielding
+    a tuple `(batch, info, is_best_checkpoint)`, where info is a dict, and
+    is_best_checkpoint is in [True, False, None] -- None indicating that
+    the iteration was not evaluated as a checkpoint. The evaluation is
+    conducted by calling the evaluate callback.
+
+    Positional arguments:
+        teacher (Language): The teacher pipeline to distill from.
+        student (Language): The student pipeline to distill into.
+        optimizer: The optimizer callable.
+        distill_data (Iterable[List[Example]]): A generator of batches,
+            with the distillation data. The distillation data iterable
+            needs to take care of iterating over the epochs and shuffling.
+        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+            The callback should take no arguments and return a tuple
+            `(main_score, other_scores)`. The main_score should be a float where
+            higher is better. other_scores can be any object.
+
+    Every iteration, the function yields out a tuple with:
+
+    * batch: A list of Example objects.
+    * info: A dict with various information about the last update (see below).
+    * is_best_checkpoint: A value in None, False, True, indicating whether this
+        was the best evaluation so far. You should use this to save the model
+        checkpoints during training. If None, evaluation was not conducted on
+        that iteration. False means evaluation was conducted, but a previous
+        evaluation was better.
+
+    The info dict provides the following information:
+
+        epoch (int): How many passes over the data have been completed.
+        step (int): How many steps have been completed.
+        score (float): The main score from the last evaluation.
+        other_scores: : The other scores from the last evaluation.
+        losses: The accumulated losses throughout training.
+        checkpoints: A list of previous results, where each result is a
+            (score, step, epoch) tuple.
+    """
+    if isinstance(dropout, float):
+        dropouts = constant(dropout)
+    else:
+        dropouts = dropout
+    results = []
+    losses: Dict[str, float] = {}
+    words_seen = 0
+    start_time = timer()
+    for step, (epoch, batch) in enumerate(distill_data):
+        if before_update:
+            before_update_args = {"step": step, "epoch": epoch}
+            before_update(student, before_update_args)
+        dropout = dropouts(optimizer.step)
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
+            student.distill(
+                teacher,
+                subbatch,
+                drop=dropout,
+                losses=losses,
+                sgd=False,
+                exclude=exclude,
+                annotates=annotating_components,
+                student_to_teacher=student_to_teacher,
+            )
+        # TODO: refactor this so we don't have to run it separately in here
+        for student_name, student_proc in student.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and student_proc.model not in (False, None)  # type: ignore[attr-defined]
+            ):
+                student_proc.finish_update(optimizer)  # type: ignore[attr-defined]
+        optimizer.step_schedules()
+        if not (step % eval_frequency):
+            if optimizer.averages:
+                with student.use_params(optimizer.averages):
+                    score, other_scores = evaluate()
+            else:
+                score, other_scores = evaluate()
+            optimizer.last_score = score  # type: ignore[assignment]
+            results.append((score, step))
+            is_best_checkpoint = score == max(results)[0]
+        else:
+            score, other_scores = (None, None)
+            is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
+        info = {
+            "epoch": epoch,
+            "step": step,
+            "score": score,
+            "other_scores": other_scores,
+            "losses": losses,
+            "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
+        }
+        yield batch, info, is_best_checkpoint
+        if is_best_checkpoint is not None:
+            losses = {}
+        # Stop if we've exhausted our max steps (if specified)
+        if max_steps and step >= max_steps:
+            break
+
+
 def train_while_improving(
     nlp: "Language",
     optimizer: Optimizer,
-    train_data,
-    evaluate,
+    train_data: Iterable[List[Example]],
+    evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
     eval_frequency: int,
@@ -174,10 +408,9 @@ def train_while_improving(
     Positional arguments:
         nlp: The spaCy pipeline to evaluate.
         optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
+        train_data (Iterable[List[Example]]): A generator of batches, with the
+            training data. The training data iterable needs to take care of
+            iterating over the epochs and shuffling.
         evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
             The callback should take no arguments and return a tuple
             `(main_score, other_scores)`. The main_score should be a float where
@@ -241,7 +474,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
-            optimizer.last_score = score
+            optimizer.last_score = score  # type: ignore[assignment]
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
@@ -273,9 +506,15 @@ def train_while_improving(
             break
 
 
-def subdivide_batch(batch, accumulate_gradient):
+def subdivide_batch(
+    batch: Union[Iterable[Doc], Iterable[Example]], accumulate_gradient: int
+):
     batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
+    if len(batch):
+        if isinstance(batch[0], Example):
+            batch.sort(key=lambda eg: len(eg.predicted))
+        else:
+            batch.sort(key=lambda doc: len(doc))
     sub_len = len(batch) // accumulate_gradient
     start = 0
     for i in range(accumulate_gradient):
@@ -320,6 +559,22 @@ def evaluate() -> Tuple[float, Dict[str, float]]:
     return evaluate
 
 
+def create_distill_batches(
+    nlp: "Language",
+    corpus: Callable[["Language"], Iterable[Example]],
+    batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
+    max_epochs: int,
+):
+    """Create distillation batches. In contrast to training, the corpus
+    is normally too large to load into memory and shuffle."""
+    epoch = 0
+    while max_epochs < 1 or epoch != max_epochs:
+        examples = corpus(nlp)
+        for batch in batcher(examples):
+            yield epoch, batch
+        epoch += 1
+
+
 def create_train_batches(
     nlp: "Language",
     corpus: Callable[["Language"], Iterable[Example]],
diff --git a/spacy/util.py b/spacy/util.py
index 7448da8ded0..3bb92e7334c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,7 @@
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+from thinc.api import fix_random_seed, set_gpu_allocator
 import functools
 import itertools
 import logging
@@ -1821,3 +1822,22 @@ def find_available_port(start: int, host: str, auto_select: bool = False) -> int
     # if we get here, the port changed
     warnings.warn(Warnings.W124.format(host=host, port=start, serve_port=port))
     return port
+
+
+def set_gpu_allocator_from_config(config: Config, use_gpu: int):
+    """Change the global GPU allocator based to the value in
+    the configuration."""
+    if "gpu_allocator" not in config["training"]:
+        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+
+
+def set_seed_from_config(config: Config):
+    """Set the random number generator seed to the value in
+    the configuration."""
+    if "seed" not in config["training"]:
+        raise ValueError(Errors.E1015.format(value="[training] seed"))
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index e38e49bf569..82cb1c14cef 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -347,19 +347,19 @@ Distill the models in a student pipeline from a teacher pipeline.
 > student.distill(teacher, examples, sgd=optimizer)
 > ```
 
-| Name                 | Description                                                                                                                                                                                 |
-| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
-| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_       |                                                                                                                                                                                             |
-| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
-| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
-| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
-| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
-| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
-| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
-| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name                 | Description                                                                                                                                                                                        |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                                 |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~        |
+| _keyword-only_       |                                                                                                                                                                                                    |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                        |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                                    |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                 |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                       |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                                     |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 

From 5b90efe5f84f76c983d81a43e329b237aa8280d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:16:03 +0200
Subject: [PATCH 069/504] Remove Python 3.7 builds

---
 .github/workflows/tests.yml               | 61 +++++++++++------------
 .github/workflows/universe_validation.yml |  2 +-
 build-constraints.txt                     |  4 +-
 requirements.txt                          |  2 +-
 4 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 840b8e5f968..760a79f2121 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
           architecture: x64
 
       - name: black
@@ -60,11 +60,9 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
-          - os: windows-latest
-            python_version: "3.7"
           - os: macos-latest
             python_version: "3.8"
-          - os: ubuntu-latest
+          - os: ubuntu-20.04
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
@@ -95,7 +93,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'
 
       - name: Delete source directory and .egg-info
         run: |
@@ -117,22 +114,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      #      - name: "Test download CLI"
+      #        run: |
+      #          python -m spacy download ca_core_news_sm
+      #          python -m spacy download ca_core_news_md
+      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test download_url in info CLI"
+      #        run: |
+      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test no warnings on load (#11713)"
+      #        run: |
+      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |
@@ -156,17 +153,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      #      - name: "Test assemble CLI"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test assemble CLI vectors warning"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml
index a1e3253a9ba..c5e68784e00 100644
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
           architecture: x64
 
       - name: Validate website/meta/universe.json
diff --git a/build-constraints.txt b/build-constraints.txt
index b1cf596ca7c..781e403c59a 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -1,6 +1,4 @@
-# build version constraints for use with wheelwright
-numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
+# build version constraints for use with wheelwright + multibuild
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy>=1.25.0; python_version>='3.9'
diff --git a/requirements.txt b/requirements.txt
index 6ad10b1d1c4..37179bc79d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests

From c7ae373f3722bc1906039d7a8fc326b5d68b8efd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:43:05 +0200
Subject: [PATCH 070/504] spancat type fixes

---
 spacy/pipeline/spancat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index bfaaf82e8d0..5c450f36a33 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -514,10 +514,9 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
         indices = activations["indices"]
         assert isinstance(indices, Ragged)
         scores = cast(Floats2d, activations["scores"])
-
         offset = 0
         for i, doc in enumerate(docs):
-            indices_i = indices[i].dataXd
+            indices_i = cast(Ints2d, indices[i].dataXd)
             if self.save_activations:
                 doc.activations[self.name] = {}
                 doc.activations[self.name]["indices"] = indices_i

From 08c89524e59bd4ccaf390380f26f146e90ec6ee7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Jun 2023 15:38:22 +0200
Subject: [PATCH 071/504] Account for differences between Span.sents in spaCy
 3/4

---
 spacy/tokens/span.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index f51a1c5ee3e..6b7782b788b 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -520,13 +520,13 @@ cdef class Span:
                     start = i
                     if start >= self.end:
                         break
-            if start < self.end:
-                spans.append(Span(self.doc, start, self.end))
-        return tuple(spans)
+                elif i == self.doc.length - 1:
+                    spans.append(Span(self.doc, start, self.doc.length))
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
-                yield Span(self.doc, start, self.doc.length)
+                spans.append(Span(self.doc, start, self.doc.length))
+        return tuple(spans)
 
     @property
     def ents(self):

From 072b309e7820ba2b247cb37438b4123389ebfdc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 23 Jun 2023 09:43:41 +0200
Subject: [PATCH 072/504] Set version to v4.0.0.dev1 (#12748)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 1ce8a44c9a4..ec1dde7cae6 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 10653fac1238c64b20a180ae2d6fa3b360220ff9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 11:41:03 +0200
Subject: [PATCH 073/504] isort all the things

---
 spacy/cli/__init__.py                         |  21 +-
 spacy/cli/_util.py                            |  21 +-
 spacy/cli/convert.py                          |   4 +-
 spacy/cli/debug_data.py                       |  10 +-
 spacy/cli/download.py                         |  12 +-
 spacy/cli/info.py                             |   2 +-
 spacy/cli/init_config.py                      |  13 +-
 spacy/cli/init_pipeline.py                    |  13 +-
 spacy/cli/project/assets.py                   | 218 +++++++++-
 spacy/cli/project/clone.py                    | 125 +++++-
 spacy/cli/project/document.py                 | 116 +++++-
 spacy/cli/project/dvc.py                      | 221 +++++++++-
 spacy/cli/project/pull.py                     |  68 +++-
 spacy/cli/project/push.py                     |  70 +++-
 spacy/cli/project/remote_storage.py           | 213 +++++++++-
 spacy/cli/project/run.py                      | 380 +++++++++++++++++-
 spacy/displacy/render.py                      |   1 +
 spacy/errors.py                               |   2 +-
 spacy/kb/__init__.py                          |   3 +-
 spacy/kb/candidate.pxd                        |   4 +-
 spacy/kb/candidate.pyx                        |   1 +
 spacy/kb/kb.pyx                               |   2 +-
 spacy/kb/kb_in_memory.pyx                     |   5 +-
 spacy/language.py                             |  61 +--
 spacy/lexeme.pxd                              |  17 +-
 spacy/lexeme.pyx                              |   3 +-
 spacy/matcher/dependencymatcher.pyx           |   2 +-
 spacy/matcher/matcher.pyi                     |  17 +-
 spacy/matcher/matcher.pyx                     |  26 +-
 spacy/matcher/phrasematcher.pyi               |   6 +-
 spacy/matcher/phrasematcher.pyx               |   8 +-
 spacy/ml/models/entity_linker.py              |  16 +-
 spacy/ml/models/parser.py                     |   9 +-
 spacy/ml/models/tok2vec.py                    |   2 -
 spacy/ml/staticvectors.py                     |   6 +-
 spacy/ml/tb_framework.pyx                     |  37 +-
 spacy/morphology.pxd                          |   2 +-
 spacy/morphology.pyx                          |   8 +-
 .../pipeline/_edit_tree_internals/schemas.py  |   8 +-
 .../_parser_internals/_beam_utils.pxd         |   1 +
 .../_parser_internals/_beam_utils.pyx         |  11 +-
 spacy/pipeline/_parser_internals/_state.pxd   |   2 -
 .../pipeline/_parser_internals/arc_eager.pyx  |   5 +-
 spacy/pipeline/_parser_internals/ner.pyx      |   9 +-
 spacy/pipeline/_parser_internals/search.pxd   |   6 +-
 spacy/pipeline/_parser_internals/search.pyx   |   5 +-
 .../pipeline/_parser_internals/stateclass.pyx |   3 +-
 .../_parser_internals/transition_system.pyx   |   4 +-
 spacy/pipeline/attribute_ruler.py             |   2 +-
 spacy/pipeline/dep_parser.py                  |  12 +-
 spacy/pipeline/edit_tree_lemmatizer.py        |  10 +-
 spacy/pipeline/entity_linker.py               |  30 +-
 spacy/pipeline/morphologizer.pyx              |  22 +-
 spacy/pipeline/ner.py                         |  21 +-
 spacy/pipeline/pipe.pyx                       |   4 +-
 spacy/pipeline/sentencizer.pyx                |   4 +-
 spacy/pipeline/senter.pyx                     |  12 +-
 spacy/pipeline/span_ruler.py                  |  10 +-
 spacy/pipeline/spancat.py                     |  20 +-
 spacy/pipeline/tagger.pyx                     |  22 +-
 spacy/pipeline/textcat.py                     |   6 +-
 spacy/pipeline/textcat_multilabel.py          |   6 +-
 spacy/pipeline/tok2vec.py                     |   6 +-
 spacy/pipeline/trainable_pipe.pyx             |  13 +-
 spacy/pipeline/transition_parser.pyx          |  52 ++-
 spacy/schemas.py                              |  51 +--
 spacy/strings.pxd                             |   5 +-
 spacy/strings.pyi                             |   3 +-
 spacy/strings.pyx                             |   6 +-
 spacy/tests/conftest.py                       |   8 +-
 spacy/tests/doc/test_span.py                  |   1 -
 spacy/tests/doc/test_underscore.py            |   1 +
 spacy/tests/parser/_search.pyx                |   7 +-
 spacy/tests/parser/test_ner.py                |   3 +-
 spacy/tests/parser/test_parse.py              |  12 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |   3 +-
 spacy/tests/pipeline/test_entity_linker.py    |   2 +-
 spacy/tests/pipeline/test_entity_ruler.py     |   8 +-
 spacy/tests/pipeline/test_initialize.py       |   7 +-
 spacy/tests/pipeline/test_morphologizer.py    |   3 +-
 spacy/tests/pipeline/test_pipe_factories.py   |   2 +
 spacy/tests/pipeline/test_senter.py           |   1 +
 spacy/tests/pipeline/test_spancat.py          |   7 +-
 spacy/tests/pipeline/test_tagger.py           |   3 +-
 spacy/tests/pipeline/test_textcat.py          |  18 +-
 .../tests/serialize/test_serialize_config.py  |  25 +-
 .../serialize/test_serialize_pipeline.py      |  11 +-
 spacy/tests/test_cli.py                       |  20 +-
 spacy/tests/test_cli_app.py                   |   2 +-
 spacy/tests/test_language.py                  |  16 +-
 spacy/tests/test_misc.py                      |  20 +-
 spacy/tests/test_symbols.py                   |   1 +
 spacy/tests/training/test_loop.py             |   4 +-
 spacy/tests/training/test_training.py         |  15 +-
 spacy/tokenizer.pxd                           |   5 -
 spacy/tokenizer.pyx                           |   8 +-
 spacy/tokens/__init__.py                      |   4 +-
 spacy/tokens/doc.pyi                          |  12 +-
 spacy/tokens/doc.pyx                          |  23 +-
 spacy/tokens/doc_bin.py                       |   4 +-
 spacy/tokens/graph.pyx                        |   6 +-
 spacy/tokens/morphanalysis.pxd                |   7 +-
 spacy/tokens/morphanalysis.pyx                |  10 +-
 spacy/tokens/retokenizer.pyx                  |  10 +-
 spacy/tokens/span.pxd                         |   2 +-
 spacy/tokens/span.pyx                         |  13 +-
 spacy/tokens/span_group.pyx                   |   7 +-
 spacy/tokens/token.pyx                        |   4 +-
 spacy/training/__init__.py                    |  29 +-
 spacy/training/align.pyx                      |   1 -
 spacy/training/batchers.py                    |  13 +
 spacy/training/callbacks.py                   |   6 +-
 spacy/training/converters/json_to_docs.py     |  12 +-
 spacy/training/example.pyx                    |   1 +
 spacy/training/gold_io.pyx                    |   4 +-
 spacy/training/initialize.py                  |  29 +-
 spacy/training/loop.py                        |  34 +-
 spacy/ty.py                                   |  16 +-
 spacy/util.py                                 |  27 +-
 spacy/vectors.pyx                             |  12 +-
 spacy/vocab.pyx                               |   3 +
 121 files changed, 2016 insertions(+), 602 deletions(-)

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 3095778fe22..b2612f57720 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,7 +16,6 @@
 from .debug_model import debug_model  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
-from .find_function import find_function  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
 from .info import info  # noqa: F401
 from .init_config import fill_config, init_config  # noqa: F401
@@ -24,17 +23,15 @@
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
-from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
-from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
-    project_document,
-)
-from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
-from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
-from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
-from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
-from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
-from .validate import validate  # type: ignore[attr-defined]  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 52a70cc7320..b005accf91f 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,10 +1,3 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
 import os
 import shutil
@@ -18,6 +11,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -32,15 +26,10 @@
 from thinc.util import gpu_is_available
 from typer.main import get_command
 from wasabi import Printer, msg
-from weasel import app as project_cli
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
-from ..compat import Literal
-from ..schemas import validate
+from ..errors import RENAMED_LANGUAGE_CODES
+from ..schemas import ProjectConfigSchema, validate
 from ..util import (
     ENV_VARS,
     SimpleFrozenDict,
@@ -52,6 +41,10 @@
     run_command,
 )
 
+if TYPE_CHECKING:
+    from pathy import FluidPath  # noqa: F401
+
+
 SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"
 
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3844b340678..a282e59c749 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,8 +8,6 @@
 import srsly
 from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
@@ -18,7 +16,7 @@
     iob_to_docs,
     json_to_docs,
 )
-from ._util import Arg, Opt, app, walk_directory
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c2253b0cb70..4c44a8c0e2b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,11 +1,3 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
 import sys
 from collections import Counter
@@ -15,6 +7,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,
@@ -30,7 +23,6 @@
 from wasabi import MESSAGES, Printer, msg
 
 from .. import util
-from ..compat import Literal
 from ..language import Language
 from ..morphology import Morphology
 from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea7a06c5417..0635522930b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,9 +7,15 @@
 from wasabi import msg
 
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 8bfc6b54f15..7a891547e0a 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,3 +1,4 @@
+import importlib.metadata
 import json
 import platform
 from pathlib import Path
@@ -7,7 +8,6 @@
 from wasabi import MarkdownRenderer, Printer
 
 from .. import about, util
-from ..compat import importlib_metadata
 from ._util import Arg, Opt, app, string_to_list
 from .download import get_latest_version, get_model_filename
 
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b29a2b748f2..ca0c316ca20 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,9 +12,16 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
-
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 1a044dedbc9..991dc1a822c 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,8 +8,17 @@
 
 from .. import util
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 591d1959e73..aa270598621 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -1 +1,217 @@
-from weasel.cli.assets import *
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import requests
+import typer
+from wasabi import msg
+
+from ...util import ensure_path, working_dir
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
+
+# Whether assets are extra if `extra` is not set.
+EXTRA_DEFAULT = False
+
+
+@project_cli.command(
+    "assets",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def project_assets_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
+    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
+    # fmt: on
+):
+    """Fetch project assets like datasets and pretrained weights. Assets are
+    defined in the "assets" section of the project.yml. If a checksum is
+    provided in the project.yml, the file is only downloaded if no local file
+    with the same checksum exists.
+
+    DOCS: https://spacy.io/api/cli#project-assets
+    """
+    overrides = parse_config_overrides(ctx.args)
+    project_assets(
+        project_dir,
+        overrides=overrides,
+        sparse_checkout=sparse_checkout,
+        extra=extra,
+    )
+
+
+def project_assets(
+    project_dir: Path,
+    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    sparse_checkout: bool = False,
+    extra: bool = False,
+) -> None:
+    """Fetch assets for a project using DVC if possible.
+
+    project_dir (Path): Path to project directory.
+    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
+                            needed.
+    extra (bool): Whether to download all assets, including those marked as 'extra'.
+    """
+    project_path = ensure_path(project_dir)
+    config = load_project_config(project_path, overrides=overrides)
+    assets = [
+        asset
+        for asset in config.get("assets", [])
+        if extra or not asset.get("extra", EXTRA_DEFAULT)
+    ]
+    if not assets:
+        msg.warn(
+            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
+            exits=0,
+        )
+    msg.info(f"Fetching {len(assets)} asset(s)")
+
+    for asset in assets:
+        dest = (project_dir / asset["dest"]).resolve()
+        checksum = asset.get("checksum")
+        if "git" in asset:
+            git_err = (
+                f"Cloning spaCy project templates requires Git and the 'git' command. "
+                f"Make sure it's installed and that the executable is available."
+            )
+            get_git_version(error=git_err)
+            if dest.exists():
+                # If there's already a file, check for checksum
+                if checksum and checksum == get_checksum(dest):
+                    msg.good(
+                        f"Skipping download with matching checksum: {asset['dest']}"
+                    )
+                    continue
+                else:
+                    if dest.is_dir():
+                        shutil.rmtree(dest)
+                    else:
+                        dest.unlink()
+            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
+                msg.fail(
+                    "A git asset must include 'repo', the repository address.", exits=1
+                )
+            if "path" not in asset["git"] or asset["git"]["path"] is None:
+                msg.fail(
+                    "A git asset must include 'path' - use \"\" to get the entire repository.",
+                    exits=1,
+                )
+            git_checkout(
+                asset["git"]["repo"],
+                asset["git"]["path"],
+                dest,
+                branch=asset["git"].get("branch"),
+                sparse=sparse_checkout,
+            )
+            msg.good(f"Downloaded asset {dest}")
+        else:
+            url = asset.get("url")
+            if not url:
+                # project.yml defines asset without URL that the user has to place
+                check_private_asset(dest, checksum)
+                continue
+            fetch_asset(project_path, url, dest, checksum)
+
+
+def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
+    """Check and validate assets without a URL (private assets that the user
+    has to provide themselves) and give feedback about the checksum.
+
+    dest (Path): Destination path of the asset.
+    checksum (Optional[str]): Optional checksum of the expected file.
+    """
+    if not Path(dest).exists():
+        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
+        msg.warn(err)
+    else:
+        if not checksum:
+            msg.good(f"Asset already exists: {dest}")
+        elif checksum == get_checksum(dest):
+            msg.good(f"Asset exists with matching checksum: {dest}")
+        else:
+            msg.fail(f"Asset available but with incorrect checksum: {dest}")
+
+
+def fetch_asset(
+    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
+) -> None:
+    """Fetch an asset from a given URL or path. If a checksum is provided and a
+    local file exists, it's only re-downloaded if the checksum doesn't match.
+
+    project_path (Path): Path to project directory.
+    url (str): URL or path to asset.
+    checksum (Optional[str]): Optional expected checksum of local file.
+    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
+        the asset failed.
+    """
+    dest_path = (project_path / dest).resolve()
+    if dest_path.exists():
+        # If there's already a file, check for checksum
+        if checksum:
+            if checksum == get_checksum(dest_path):
+                msg.good(f"Skipping download with matching checksum: {dest}")
+                return
+        else:
+            # If there's not a checksum, make sure the file is a possibly valid size
+            if os.path.getsize(dest_path) == 0:
+                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
+                os.remove(dest_path)
+    # We might as well support the user here and create parent directories in
+    # case the asset dir isn't listed as a dir to create in the project.yml
+    if not dest_path.parent.exists():
+        dest_path.parent.mkdir(parents=True)
+    with working_dir(project_path):
+        url = convert_asset_url(url)
+        try:
+            download_file(url, dest_path)
+            msg.good(f"Downloaded asset {dest}")
+        except requests.exceptions.RequestException as e:
+            if Path(url).exists() and Path(url).is_file():
+                # If it's a local file, copy to destination
+                shutil.copy(url, str(dest_path))
+                msg.good(f"Copied local asset {dest}")
+            else:
+                msg.fail(f"Download failed: {dest}", e)
+    if checksum and checksum != get_checksum(dest_path):
+        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
+
+
+def convert_asset_url(url: str) -> str:
+    """Check and convert the asset URL if needed.
+
+    url (str): The asset URL.
+    RETURNS (str): The converted URL.
+    """
+    # If the asset URL is a regular GitHub URL it's likely a mistake
+    if (
+        re.match(r"(http(s?)):\/\/github.com", url)
+        and "releases/download" not in url
+        and "/raw/" not in url
+    ):
+        converted = url.replace("github.com", "raw.githubusercontent.com")
+        converted = re.sub(r"/(tree|blob)/", "/", converted)
+        msg.warn(
+            "Downloading from a regular GitHub URL. This will only download "
+            "the source of the page, not the actual file. Converting the URL "
+            "to a raw URL.",
+            converted,
+        )
+        return converted
+    return url
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 11d2511a361..2ee27c92adb 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1 +1,124 @@
-from weasel.cli.clone import *
+import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
+
+from ... import about
+from ...util import ensure_path
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)
+
+DEFAULT_REPO = about.__projects__
+DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
+DEFAULT_BRANCHES = ["main", "master"]
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
+    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo).
+
+    DOCS: https://spacy.io/api/cli#project-clone
+    """
+    if dest is None:
+        dest = Path.cwd() / Path(name).parts[-1]
+    if repo == DEFAULT_REPO and branch is None:
+        branch = DEFAULT_PROJECTS_BRANCH
+
+    if branch is None:
+        for default_branch in DEFAULT_BRANCHES:
+            if git_repo_branch_exists(repo, default_branch):
+                branch = default_branch
+                break
+        if branch is None:
+            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
+            msg.fail(
+                "No branch provided and attempted default "
+                f"branches {default_branches_msg} do not exist.",
+                exits=1,
+            )
+    else:
+        if not git_repo_branch_exists(repo, branch):
+            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
+    assert isinstance(branch, str)
+    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
+
+
+def project_clone(
+    name: str,
+    dest: Path,
+    *,
+    repo: str = about.__projects__,
+    branch: str = about.__projects_branch__,
+    sparse_checkout: bool = False,
+) -> None:
+    """Clone a project template from a repository.
+
+    name (str): Name of subdirectory to clone.
+    dest (Path): Destination path of cloned project.
+    repo (str): URL of Git repo containing project templates.
+    branch (str): The branch to clone from
+    """
+    dest = ensure_path(dest)
+    check_clone(name, dest, repo)
+    project_dir = dest.resolve()
+    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
+    try:
+        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
+    except subprocess.CalledProcessError:
+        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
+        msg.fail(err, exits=1)
+    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
+    if not (project_dir / PROJECT_FILE).exists():
+        msg.warn(f"No {PROJECT_FILE} found in directory")
+    else:
+        msg.good(f"Your project is now ready!")
+        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    git_err = (
+        f"Cloning spaCy project templates requires Git and the 'git' command. "
+        f"To clone a project without Git, copy the files from the '{name}' "
+        f"directory in the {repo} to {dest} manually."
+    )
+    get_git_version(error=git_err)
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
+            f"Create the necessary folder(s) first before continuing.",
+            exits=1,
+        )
diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
index 1952524a933..80107d27acf 100644
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@@ -1 +1,115 @@
-from weasel.cli.document import *
+from pathlib import Path
+
+from wasabi import MarkdownRenderer, msg
+
+from ...util import working_dir
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
+
+DOCS_URL = "https://spacy.io"
+INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
+project, as well as the available commands and workflows. For details, see the
+[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
+INTRO_COMMANDS = f"""The following commands are defined by the project. They
+can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
+Commands are only re-run if their inputs have changed."""
+INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
+can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
+and will run the specified commands in order. Commands are only re-run if their
+inputs have changed."""
+INTRO_ASSETS = f"""The following assets are defined by the project. They can
+be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
+in the project directory."""
+# These markers are added to the Markdown and can be used to update the file in
+# place if it already exists. Only the auto-generated part will be replaced.
+MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
+MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
+# If this marker is used in an existing README, it's ignored and not replaced
+MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
+
+
+@project_cli.command("document")
+def project_document_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
+    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
+    # fmt: on
+):
+    """
+    Auto-generate a README.md for a project. If the content is saved to a file,
+    hidden markers are added so you can add custom content before or after the
+    auto-generated section and only the auto-generated docs will be replaced
+    when you re-run the command.
+
+    DOCS: https://spacy.io/api/cli#project-document
+    """
+    project_document(project_dir, output_file, no_emoji=no_emoji)
+
+
+def project_document(
+    project_dir: Path, output_file: Path, *, no_emoji: bool = False
+) -> None:
+    is_stdout = str(output_file) == "-"
+    config = load_project_config(project_dir)
+    md = MarkdownRenderer(no_emoji=no_emoji)
+    md.add(MARKER_START)
+    title = config.get("title")
+    description = config.get("description")
+    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
+    if description:
+        md.add(description)
+    md.add(md.title(2, PROJECT_FILE, "📋"))
+    md.add(INTRO_PROJECT)
+    # Commands
+    cmds = config.get("commands", [])
+    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
+    if data:
+        md.add(md.title(3, "Commands", "⏯"))
+        md.add(INTRO_COMMANDS)
+        md.add(md.table(data, ["Command", "Description"]))
+    # Workflows
+    wfs = config.get("workflows", {}).items()
+    data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
+    if data:
+        md.add(md.title(3, "Workflows", "⏭"))
+        md.add(INTRO_WORKFLOWS)
+        md.add(md.table(data, ["Workflow", "Steps"]))
+    # Assets
+    assets = config.get("assets", [])
+    data = []
+    for a in assets:
+        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
+        dest_path = a["dest"]
+        dest = md.code(dest_path)
+        if source == "Local":
+            # Only link assets if they're in the repo
+            with working_dir(project_dir) as p:
+                if (p / dest_path).exists():
+                    dest = md.link(dest, dest_path)
+        data.append((dest, source, a.get("description", "")))
+    if data:
+        md.add(md.title(3, "Assets", "🗂"))
+        md.add(INTRO_ASSETS)
+        md.add(md.table(data, ["File", "Source", "Description"]))
+    md.add(MARKER_END)
+    # Output result
+    if is_stdout:
+        print(md.text)
+    else:
+        content = md.text
+        if output_file.exists():
+            with output_file.open("r", encoding="utf8") as f:
+                existing = f.read()
+            if MARKER_IGNORE in existing:
+                msg.warn("Found ignore marker in existing file: skipping", output_file)
+                return
+            if MARKER_START in existing and MARKER_END in existing:
+                msg.info("Found existing file: only replacing auto-generated docs")
+                before = existing.split(MARKER_START)[0]
+                after = existing.split(MARKER_END)[1]
+                content = f"{before}{content}{after}"
+            else:
+                msg.warn("Replacing existing file")
+        with output_file.open("w", encoding="utf8") as f:
+            f.write(content)
+        msg.good("Saved project documentation", output_file)
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index aa1ae7dd9ed..9ad55c43302 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -1 +1,220 @@
-from weasel.cli.dvc import *
+"""This module contains helpers and subcommands for integrating spaCy projects
+with Data Version Controk (DVC). https://dvc.org"""
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+from wasabi import msg
+
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)
+
+DVC_CONFIG = "dvc.yaml"
+DVC_DIR = ".dvc"
+UPDATE_COMMAND = "dvc"
+DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
+# edited your {PROJECT_FILE}, you can regenerate this file by running:
+# {COMMAND} project {UPDATE_COMMAND}"""
+
+
+@project_cli.command(UPDATE_COMMAND)
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Auto-generate Data Version Control (DVC) config. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. If no workflow is specified, the first defined
+    workflow is used. The DVC config will only be updated if the project.yml
+    changed.
+
+    DOCS: https://spacy.io/api/cli#project-dvc
+    """
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
+
+
+def project_update_dvc(
+    project_dir: Path,
+    workflow: Optional[str] = None,
+    *,
+    verbose: bool = False,
+    quiet: bool = False,
+    force: bool = False,
+) -> None:
+    """Update the auto-generated Data Version Control (DVC) config file. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. Will only update the file if the checksum changed.
+
+    project_dir (Path): The project directory.
+    workflow (Optional[str]): Optional name of workflow defined in project.yml.
+        If not set, the first workflow will be used.
+    verbose (bool): Print more info.
+    quiet (bool): Print less info.
+    force (bool): Force update DVC config.
+    """
+    config = load_project_config(project_dir)
+    updated = update_dvc_config(
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
+    )
+    help_msg = "To execute the workflow with DVC, run: dvc repro"
+    if updated:
+        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
+    else:
+        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
+
+
+def update_dvc_config(
+    path: Path,
+    config: Dict[str, Any],
+    workflow: Optional[str] = None,
+    verbose: bool = False,
+    quiet: bool = False,
+    force: bool = False,
+) -> bool:
+    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
+    project directory. The file is auto-generated based on the config. The
+    first line of the auto-generated file specifies the hash of the config
+    dict, so if any of the config values change, the DVC config is regenerated.
+
+    path (Path): The path to the project directory.
+    config (Dict[str, Any]): The loaded project.yml.
+    verbose (bool): Whether to print additional info (via DVC).
+    quiet (bool): Don't output anything (via DVC).
+    force (bool): Force update, even if hashes match.
+    RETURNS (bool): Whether the DVC config file was updated.
+    """
+    ensure_dvc(path)
+    workflows = config.get("workflows", {})
+    workflow_names = list(workflows.keys())
+    check_workflows(workflow_names, workflow)
+    if not workflow:
+        workflow = workflow_names[0]
+    config_hash = get_hash(config)
+    path = path.resolve()
+    dvc_config_path = path / DVC_CONFIG
+    if dvc_config_path.exists():
+        # Check if the file was generated using the current config, if not, redo
+        with dvc_config_path.open("r", encoding="utf8") as f:
+            ref_hash = f.readline().strip().replace("# ", "")
+        if ref_hash == config_hash and not force:
+            return False  # Nothing has changed in project.yml, don't need to update
+        dvc_config_path.unlink()
+    dvc_commands = []
+    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
+    for name in workflows[workflow]:
+        command = config_commands[name]
+        deps = command.get("deps", [])
+        outputs = command.get("outputs", [])
+        outputs_no_cache = command.get("outputs_no_cache", [])
+        if not deps and not outputs and not outputs_no_cache:
+            continue
+        # Default to the working dir as the project path since dvc.yaml is auto-generated
+        # and we don't want arbitrary paths in there
+        project_cmd = ["python", "-m", NAME, "project", "run", name]
+        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
+        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
+        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
+
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
+        if command.get("no_skip"):
+            dvc_cmd.append("--always-changed")
+        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
+        dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
+    with working_dir(path):
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
+    with dvc_config_path.open("r+", encoding="utf8") as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
+    return True
+
+
+def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
+    """Validate workflows provided in project.yml and check that a given
+    workflow can be used to generate a DVC config.
+
+    workflows (List[str]): Names of the available workflows.
+    workflow (Optional[str]): The name of the workflow to convert.
+    """
+    if not workflows:
+        msg.fail(
+            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
+            f"define at least one list of commands.",
+            exits=1,
+        )
+    if workflow is not None and workflow not in workflows:
+        msg.fail(
+            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
+            f"Available workflows: {', '.join(workflows)}",
+            exits=1,
+        )
+    if not workflow:
+        msg.warn(
+            f"No workflow specified for DVC pipeline. Using the first workflow "
+            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
+        )
+
+
+def ensure_dvc(project_dir: Path) -> None:
+    """Ensure that the "dvc" command is available and that the current project
+    directory is an initialized DVC project.
+    """
+    try:
+        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            "To use spaCy projects with DVC (Data Version Control), DVC needs "
+            "to be installed and the 'dvc' command needs to be available",
+            "You can install the Python package from pip (pip install dvc) or "
+            "conda (conda install -c conda-forge dvc). For more details, see the "
+            "documentation: https://dvc.org/doc/install",
+            exits=1,
+        )
+    if not (project_dir / ".dvc").exists():
+        msg.fail(
+            "Project not initialized as a DVC project",
+            "To initialize a DVC project, you can run 'dvc init' in the project "
+            "directory. For more details, see the documentation: "
+            "https://dvc.org/doc/command-reference/init",
+            exits=1,
+        )
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 5e603273d94..e9be74df7f4 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -1 +1,67 @@
-from weasel.cli.pull import *
+from pathlib import Path
+
+from wasabi import msg
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
+from .run import update_lockfile
+
+
+@project_cli.command("pull")
+def project_pull_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Retrieve available precomputed outputs from a remote storage.
+    You can alias remotes in your project.yml by mapping them to storage paths.
+    A storage can be anything that the smart-open library can upload to, e.g.
+    AWS, Google Cloud Storage, SSH, local directories etc.
+
+    DOCS: https://spacy.io/api/cli#project-pull
+    """
+    for url, output_path in project_pull(project_dir, remote):
+        if url is not None:
+            msg.good(f"Pulled {output_path} from {url}")
+
+
+def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    # TODO: We don't have tests for this :(. It would take a bit of mockery to
+    # set up. I guess see if it breaks first?
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    commands = list(config.get("commands", []))
+    # We use a while loop here because we don't know how the commands
+    # will be ordered. A command might need dependencies from one that's later
+    # in the list.
+    while commands:
+        for i, cmd in enumerate(list(commands)):
+            logger.debug("CMD: %s.", cmd["name"])
+            deps = [project_dir / dep for dep in cmd.get("deps", [])]
+            if all(dep.exists() for dep in deps):
+                cmd_hash = get_command_hash("", "", deps, cmd["script"])
+                for output_path in cmd.get("outputs", []):
+                    url = storage.pull(output_path, command_hash=cmd_hash)
+                    logger.debug(
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
+                    )
+                    yield url, output_path
+
+                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
+                if all(loc.exists() for loc in out_locs):
+                    update_lockfile(project_dir, cmd)
+                # We remove the command from the list here, and break, so that
+                # we iterate over the loop again.
+                commands.pop(i)
+                break
+            else:
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
+        else:
+            # If we didn't break the for loop, break the while loop.
+            break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index 3a8e8869db1..a7915e54741 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -1 +1,69 @@
-from weasel.cli.push import *
+from pathlib import Path
+
+from wasabi import msg
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
+
+
+@project_cli.command("push")
+def project_push_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Persist outputs to a remote storage. You can alias remotes in your
+    project.yml by mapping them to storage paths. A storage can be anything that
+    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
+    local directories etc.
+
+    DOCS: https://spacy.io/api/cli#project-push
+    """
+    for output_path, url in project_push(project_dir, remote):
+        if url is None:
+            msg.info(f"Skipping {output_path}")
+        else:
+            msg.good(f"Pushed {output_path} to {url}")
+
+
+def project_push(project_dir: Path, remote: str):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. gcs, aws, ssh, local directories etc
+    """
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        logger.debug("CMD: %s", cmd["name"])
+        deps = [project_dir / dep for dep in cmd.get("deps", [])]
+        if any(not dep.exists() for dep in deps):
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
+            continue
+        cmd_hash = get_command_hash(
+            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
+        )
+        logger.debug("CMD_HASH: %s", cmd_hash)
+        for output_path in cmd.get("outputs", []):
+            output_loc = project_dir / output_path
+            if output_loc.exists() and _is_not_empty_dir(output_loc):
+                url = storage.push(
+                    output_path,
+                    command_hash=cmd_hash,
+                    content_hash=get_content_hash(output_loc),
+                )
+                logger.debug(
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
+                )
+                yield output_path, url
+
+
+def _is_not_empty_dir(loc: Path):
+    if not loc.is_dir():
+        return True
+    elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
+        return True
+    else:
+        return False
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 29409150fad..84235a90d39 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -1 +1,212 @@
-from weasel.cli.remote_storage import *
+import hashlib
+import os
+import site
+import tarfile
+import urllib.parse
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from wasabi import msg
+
+from ... import about
+from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)
+
+if TYPE_CHECKING:
+    from pathy import FluidPath  # noqa: F401
+
+
+class RemoteStorage:
+    """Push and pull outputs to and from a remote file storage.
+
+    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
+    ssh, etc.
+    """
+
+    def __init__(self, project_root: Path, url: str, *, compression="gz"):
+        self.root = project_root
+        self.url = ensure_pathy(url)
+        self.compression = compression
+
+    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
+        """Compress a file or directory within a project and upload it to a remote
+        storage. If an object exists at the full URL, nothing is done.
+
+        Within the remote storage, files are addressed by their project path
+        (url encoded) and two user-supplied hashes, representing their creation
+        context and their file contents. If the URL already exists, the data is
+        not uploaded. Paths are archived and compressed prior to upload.
+        """
+        loc = self.root / path
+        if not loc.exists():
+            raise IOError(f"Cannot push {loc}: does not exist.")
+        url = self.make_url(path, command_hash, content_hash)
+        if url.exists():
+            return url
+        tmp: Path
+        with make_tempdir() as tmp:
+            tar_loc = tmp / self.encode_name(str(path))
+            mode_string = f"w:{self.compression}" if self.compression else "w"
+            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                tar_file.add(str(loc), arcname=str(path))
+            upload_file(tar_loc, url)
+        return url
+
+    def pull(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["FluidPath"]:
+        """Retrieve a file from the remote cache. If the file already exists,
+        nothing is done.
+
+        If the command_hash and/or content_hash are specified, only matching
+        results are returned. If no results are available, an error is raised.
+        """
+        dest = self.root / path
+        if dest.exists():
+            return None
+        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
+        if url is None:
+            return url
+        else:
+            # Make sure the destination exists
+            if not dest.parent.exists():
+                dest.parent.mkdir(parents=True)
+            tmp: Path
+            with make_tempdir() as tmp:
+                tar_loc = tmp / url.parts[-1]
+                download_file(url, tar_loc)
+                mode_string = f"r:{self.compression}" if self.compression else "r"
+                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                    # This requires that the path is added correctly, relative
+                    # to root. This is how we set things up in push()
+
+                    # Disallow paths outside the current directory for the tar
+                    # file (CVE-2007-4559, directory traversal vulnerability)
+                    def is_within_directory(directory, target):
+                        abs_directory = os.path.abspath(directory)
+                        abs_target = os.path.abspath(target)
+                        prefix = os.path.commonprefix([abs_directory, abs_target])
+                        return prefix == abs_directory
+
+                    def safe_extract(tar, path):
+                        for member in tar.getmembers():
+                            member_path = os.path.join(path, member.name)
+                            if not is_within_directory(path, member_path):
+                                raise ValueError(Errors.E852)
+                        tar.extractall(path)
+
+                    safe_extract(tar_file, self.root)
+        return url
+
+    def find(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["FluidPath"]:
+        """Find the best matching version of a file within the storage,
+        or `None` if no match can be found. If both the creation and content hash
+        are specified, only exact matches will be returned. Otherwise, the most
+        recent matching file is preferred.
+        """
+        name = self.encode_name(str(path))
+        urls = []
+        if command_hash is not None and content_hash is not None:
+            url = self.url / name / command_hash / content_hash
+            urls = [url] if url.exists() else []
+        elif command_hash is not None:
+            if (self.url / name / command_hash).exists():
+                urls = list((self.url / name / command_hash).iterdir())
+        else:
+            if (self.url / name).exists():
+                for sub_dir in (self.url / name).iterdir():
+                    urls.extend(sub_dir.iterdir())
+                if content_hash is not None:
+                    urls = [url for url in urls if url.parts[-1] == content_hash]
+        if len(urls) >= 2:
+            try:
+                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
+            except Exception:
+                msg.warn(
+                    "Unable to sort remote files by last modified. The file(s) "
+                    "pulled from the cache may not be the most recent."
+                )
+        return urls[-1] if urls else None
+
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
+        """Construct a URL from a subpath, a creation hash and a content hash."""
+        return self.url / self.encode_name(str(path)) / command_hash / content_hash
+
+    def encode_name(self, name: str) -> str:
+        """Encode a subpath into a URL-safe name."""
+        return urllib.parse.quote_plus(name)
+
+
+def get_content_hash(loc: Path) -> str:
+    return get_checksum(loc)
+
+
+def get_command_hash(
+    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
+) -> str:
+    """Create a hash representing the execution of a command. This includes the
+    currently installed packages, whatever environment variables have been marked
+    as relevant, and the command.
+    """
+    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
+        spacy_v = GIT_VERSION
+    else:
+        spacy_v = str(get_minor_version(about.__version__) or "")
+    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
+    hashes.extend(cmd)
+    creation_bytes = "".join(hashes).encode("utf8")
+    return hashlib.md5(creation_bytes).hexdigest()
+
+
+def get_site_hash():
+    """Hash the current Python environment's site-packages contents, including
+    the name and version of the libraries. The list we're hashing is what
+    `pip freeze` would output.
+    """
+    site_dirs = site.getsitepackages()
+    if site.ENABLE_USER_SITE:
+        site_dirs.extend(site.getusersitepackages())
+    packages = set()
+    for site_dir in site_dirs:
+        site_dir = Path(site_dir)
+        for subpath in site_dir.iterdir():
+            if subpath.parts[-1].endswith("dist-info"):
+                packages.add(subpath.parts[-1].replace(".dist-info", ""))
+    package_bytes = "".join(sorted(packages)).encode("utf8")
+    return hashlib.md5sum(package_bytes).hexdigest()
+
+
+def get_env_hash(env: Dict[str, str]) -> str:
+    """Construct a hash of the environment variables that will be passed into
+    the commands.
+
+    Values in the env dict may be references to the current os.environ, using
+    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
+    """
+    env_vars = {}
+    for key, value in env.items():
+        if value.startswith("$"):
+            env_vars[key] = os.environ.get(value[1:], "")
+        else:
+            env_vars[key] = value
+    return get_hash(env_vars)
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index cc6a5ac4256..43972a2026a 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -1 +1,379 @@
-from weasel.cli.run import *
+import os.path
+import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
+import srsly
+import typer
+from wasabi import msg
+from wasabi.util import locale_escape
+
+from ... import about
+from ...git_info import GIT_VERSION
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
+
+
+@project_cli.command(
+    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
+)
+def project_run_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named command or workflow defined in the project.yml. If a workflow
+    name is specified, all commands in the workflow are run, in order. If
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
+
+    DOCS: https://spacy.io/api/cli#project-run
+    """
+    if show_help or not subcommand:
+        print_run_help(project_dir, subcommand)
+    else:
+        overrides = parse_config_overrides(ctx.args)
+        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
+
+
+def project_run(
+    project_dir: Path,
+    subcommand: str,
+    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    force: bool = False,
+    dry: bool = False,
+    capture: bool = False,
+    skip_requirements_check: bool = False,
+) -> None:
+    """Run a named script defined in the project.yml. If the script is part
+    of the default pipeline (defined in the "run" section), DVC is used to
+    execute the command, so it can determine whether to rerun it. It then
+    calls into "exec" to execute it.
+
+    project_dir (Path): Path to project directory.
+    subcommand (str): Name of command to run.
+    overrides (Dict[str, Any]): Optional config overrides.
+    force (bool): Force re-running, even if nothing changed.
+    dry (bool): Perform a dry run and don't execute commands.
+    capture (bool): Whether to capture the output and errors of individual commands.
+        If False, the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the return code. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
+    skip_requirements_check (bool): Whether to skip the requirements check.
+    """
+    config = load_project_config(project_dir, overrides=overrides)
+    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    workflows = config.get("workflows", {})
+    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+
+    req_path = project_dir / "requirements.txt"
+    if not skip_requirements_check:
+        if config.get("check_requirements", True) and os.path.exists(req_path):
+            with req_path.open() as requirements_file:
+                _check_requirements([req.strip() for req in requirements_file])
+
+    if subcommand in workflows:
+        msg.info(f"Running workflow '{subcommand}'")
+        for cmd in workflows[subcommand]:
+            project_run(
+                project_dir,
+                cmd,
+                overrides=overrides,
+                force=force,
+                dry=dry,
+                capture=capture,
+                skip_requirements_check=True,
+            )
+    else:
+        cmd = commands[subcommand]
+        for dep in cmd.get("deps", []):
+            if not (project_dir / dep).exists():
+                err = f"Missing dependency specified by command '{subcommand}': {dep}"
+                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
+                err_exits = 1 if not dry else None
+                msg.fail(err, err_help, exits=err_exits)
+        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+        with working_dir(project_dir) as current_dir:
+            msg.divider(subcommand)
+            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
+            if not rerun and not force:
+                msg.info(f"Skipping '{cmd['name']}': nothing changed")
+            else:
+                run_commands(cmd["script"], dry=dry, capture=capture)
+                if not dry:
+                    update_lockfile(current_dir, cmd)
+
+
+def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
+    """Simulate a CLI help prompt using the info available in the project.yml.
+
+    project_dir (Path): The project directory.
+    subcommand (Optional[str]): The subcommand or None. If a subcommand is
+        provided, the subcommand help is shown. Otherwise, the top-level help
+        and a list of available commands is printed.
+    """
+    config = load_project_config(project_dir)
+    config_commands = config.get("commands", [])
+    commands = {cmd["name"]: cmd for cmd in config_commands}
+    workflows = config.get("workflows", {})
+    project_loc = "" if is_cwd(project_dir) else project_dir
+    if subcommand:
+        validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
+        if subcommand in commands:
+            help_text = commands[subcommand].get("help")
+            if help_text:
+                print(f"\n{help_text}\n")
+        elif subcommand in workflows:
+            steps = workflows[subcommand]
+            print(f"\nWorkflow consisting of {len(steps)} commands:")
+            steps_data = [
+                (f"{i + 1}. {step}", commands[step].get("help", ""))
+                for i, step in enumerate(steps)
+            ]
+            msg.table(steps_data)
+            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
+            print(f"For command details, run: {help_cmd}")
+    else:
+        print("")
+        title = config.get("title")
+        if title:
+            print(f"{locale_escape(title)}\n")
+        if config_commands:
+            print(f"Available commands in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
+            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
+        if workflows:
+            print(f"Available workflows in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
+            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
+
+
+def run_commands(
+    commands: Iterable[str] = SimpleFrozenList(),
+    silent: bool = False,
+    dry: bool = False,
+    capture: bool = False,
+) -> None:
+    """Run a sequence of commands in a subprocess, in order.
+
+    commands (List[str]): The string commands.
+    silent (bool): Don't print the commands.
+    dry (bool): Perform a dry run and don't execut anything.
+    capture (bool): Whether to capture the output and errors of individual commands.
+        If False, the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the return code. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
+    """
+    for c in commands:
+        command = split_command(c)
+        # Not sure if this is needed or a good idea. Motivation: users may often
+        # use commands in their config that reference "python" and we want to
+        # make sure that it's always executing the same Python that spaCy is
+        # executed with and the pip in the same env, not some other Python/pip.
+        # Also ensures cross-compatibility if user 1 writes "python3" (because
+        # that's how it's set up on their system), and user 2 without the
+        # shortcut tries to re-run the command.
+        if len(command) and command[0] in ("python", "python3"):
+            command[0] = sys.executable
+        elif len(command) and command[0] in ("pip", "pip3"):
+            command = [sys.executable, "-m", "pip", *command[1:]]
+        if not silent:
+            print(f"Running command: {join_command(command)}")
+        if not dry:
+            run_command(command, capture=capture)
+
+
+def validate_subcommand(
+    commands: Sequence[str], workflows: Sequence[str], subcommand: str
+) -> None:
+    """Check that a subcommand is valid and defined. Raises an error otherwise.
+
+    commands (Sequence[str]): The available commands.
+    subcommand (str): The subcommand.
+    """
+    if not commands and not workflows:
+        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
+    if subcommand not in commands and subcommand not in workflows:
+        help_msg = []
+        if subcommand in ["assets", "asset"]:
+            help_msg.append("Did you mean to run: python -m spacy project assets?")
+        if commands:
+            help_msg.append(f"Available commands: {', '.join(commands)}")
+        if workflows:
+            help_msg.append(f"Available workflows: {', '.join(workflows)}")
+        msg.fail(
+            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
+            ". ".join(help_msg),
+            exits=1,
+        )
+
+
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    *,
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
+    """Check if a command should be rerun because its settings or inputs/outputs
+    changed.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    strict_version (bool):
+    RETURNS (bool): Whether to re-run the command.
+    """
+    # Always rerun if no-skip is set
+    if command.get("no_skip", False):
+        return True
+    lock_path = project_dir / PROJECT_LOCK
+    if not lock_path.exists():  # We don't have a lockfile, run command
+        return True
+    data = srsly.read_yaml(lock_path)
+    if command["name"] not in data:  # We don't have info about this command
+        return True
+    entry = data[command["name"]]
+    # Always run commands with no outputs (otherwise they'd always be skipped)
+    if not entry.get("outs", []):
+        return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
+    # If the entry in the lockfile matches the lockfile entry that would be
+    # generated from the current command, we don't rerun because it means that
+    # all inputs/outputs, hashes and scripts are the same and nothing changed
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
+
+
+def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
+    """Update the lockfile after running a command. Will create a lockfile if
+    it doesn't yet exist and will add an entry for the current command, its
+    script and dependencies/outputs.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    if not lock_path.exists():
+        srsly.write_yaml(lock_path, {})
+        data = {}
+    else:
+        data = srsly.read_yaml(lock_path)
+    data[command["name"]] = get_lock_entry(project_dir, command)
+    srsly.write_yaml(lock_path, data)
+
+
+def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
+    """Get a lockfile entry for a given command. An entry includes the command,
+    the script (command steps) and a list of dependencies and outputs with
+    their paths and file hashes, if available. The format is based on the
+    dvc.lock files, to keep things consistent.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    RETURNS (Dict[str, Any]): The lockfile entry.
+    """
+    deps = get_fileinfo(project_dir, command.get("deps", []))
+    outs = get_fileinfo(project_dir, command.get("outputs", []))
+    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
+    return {
+        "cmd": f"{COMMAND} run {command['name']}",
+        "script": command["script"],
+        "deps": deps,
+        "outs": [*outs, *outs_nc],
+        "spacy_version": about.__version__,
+        "spacy_git_version": GIT_VERSION,
+    }
+
+
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
+    """Generate the file information for a list of paths (dependencies, outputs).
+    Includes the file path and the file's checksum.
+
+    project_dir (Path): The current project directory.
+    paths (List[str]): The file paths.
+    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
+    """
+    data = []
+    for path in paths:
+        file_path = project_dir / path
+        md5 = get_checksum(file_path) if file_path.exists() else None
+        data.append({"path": path, "md5": md5})
+    return data
+
+
+def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
+    """Checks whether requirements are installed and free of version conflicts.
+    requirements (List[str]): List of requirements.
+    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
+        exist.
+    """
+    import pkg_resources
+
+    failed_pkgs_msgs: List[str] = []
+    conflicting_pkgs_msgs: List[str] = []
+
+    for req in requirements:
+        try:
+            pkg_resources.require(req)
+        except pkg_resources.DistributionNotFound as dnf:
+            failed_pkgs_msgs.append(dnf.report())
+        except pkg_resources.VersionConflict as vc:
+            conflicting_pkgs_msgs.append(vc.report())
+        except Exception:
+            msg.warn(
+                f"Unable to check requirement: {req} "
+                "Checks are currently limited to requirement specifiers "
+                "(PEP 508)"
+            )
+
+    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
+        msg.warn(
+            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
+            "correctly and you installed all requirements specified in your project's requirements.txt: "
+        )
+        for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
+            msg.text(pgk_msg)
+
+    return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 40b9986e85b..b7f689bcb3e 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,3 +1,4 @@
+import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union
 
diff --git a/spacy/errors.py b/spacy/errors.py
index fe067f7915d..4909371d549 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,5 +1,5 @@
-from typing import Literal
 import warnings
+from typing import Literal
 
 
 class ErrorsWithCodes(type):
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 7155c15df9a..2aa084ef52a 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,6 +1,5 @@
-from .candidate import Candidate, get_candidates, get_candidates_batch
+from .candidate import Candidate, InMemoryCandidate
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, InMemoryCandidate
 
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index f21f423e496..4419ed47666 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,6 +1,8 @@
 from libcpp.vector cimport vector
-from .kb_in_memory cimport InMemoryLookupKB
+
 from ..typedefs cimport hash_t
+from .kb_in_memory cimport InMemoryLookupKB
+
 
 cdef class Candidate:
     pass
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index bf66ccfae67..1739cfa64f6 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,6 +1,7 @@
 # cython: infer_types=True
 
 from .kb_in_memory cimport InMemoryLookupKB
+
 from ..errors import Errors
 
 
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index bb58bf88a46..c3479eabc18 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
 
-from .candidate import Candidate
+from ..errors import Errors
 from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from .candidate import Candidate
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 3aab0d73e72..fee407e68b2 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,5 +1,5 @@
-# cython: infer_types=True
-from typing import Any, Callable, Dict, Iterable
+# cython: infer_types=True, profile=True
+from typing import Any, Callable, Dict, Iterable, Union
 
 import srsly
 
@@ -22,6 +22,7 @@ from ..util import SimpleFrozenList, ensure_path
 
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
+
 from .candidate import InMemoryCandidate
 
 
diff --git a/spacy/language.py b/spacy/language.py
index 028f733200e..ea641224684 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,12 +1,4 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
 import functools
-import inspect
 import itertools
 import multiprocessing as mp
 import random
@@ -25,6 +17,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     NoReturn,
     Optional,
     Pattern,
@@ -37,29 +30,41 @@
     overload,
 )
 
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples, validate_distillation_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from . import util
-from . import about
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
 from .lookups import load_lookups
-
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+    ConfigSchema,
+    ConfigSchemaInit,
+    ConfigSchemaNlp,
+    ConfigSchemaPretrain,
+    validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_distillation_examples, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+    _DEFAULT_EMPTY_PIPES,
+    CONFIG_SECTION_ORDER,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    _pipe,
+    combine_score_weights,
+    raise_error,
+    registry,
+    warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 2d14edcd6b0..ff51d77e8a9 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,10 +1,19 @@
 from numpy cimport ndarray
 
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
+from .attrs cimport (
+    ID,
+    LANG,
+    LENGTH,
+    LOWER,
+    NORM,
+    ORTH,
+    PREFIX,
+    SHAPE,
+    SUFFIX,
+    attr_id_t,
+)
 from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 41fc8f1d2b1..92ef3b16259 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -2,6 +2,7 @@
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
+from cython.view cimport array as cvarray
 from libc.string cimport memset
 
 np.import_array()
@@ -35,7 +36,7 @@ from .typedefs cimport attr_t, flags_t
 from .attrs import intify_attrs
 from .errors import Errors, Warnings
 
-OOV_RANK = 0xffffffffffffffff  # UINT64_MAX
+OOV_RANK = 0xffffffffffffffff # UINT64_MAX
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.id = OOV_RANK
 
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 0b639ab04fb..60299603623 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
 import warnings
 from collections import defaultdict
 from itertools import product
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index a0b6d91e7d5..fe2d8bec3bc 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,6 +1,17 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
-from typing import Iterator, Iterable, overload
-from ..vocab import Vocab
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 7e734ac247e..8accd8c4465 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True
+# cython: binding=True, infer_types=True, profile=True
 from typing import Iterable, List
 
 from cymem.cymem cimport Pool
@@ -12,23 +12,35 @@ import warnings
 
 import srsly
 
-from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
+from ..attrs cimport (
+    DEP,
+    ENT_IOB,
+    ID,
+    LEMMA,
+    MORPH,
+    NULL_ATTR,
+    ORTH,
+    POS,
+    TAG,
+    attr_id_t,
+)
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
+from ..vocab cimport Vocab
 
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings cimport get_string_id
-from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
-from ..strings import get_string_id
 from .levenshtein import levenshtein_compare
 
+from ..strings cimport get_string_id
+
+from ..attrs import IDS
+from ..util import registry
+
 DEF PADDING = 5
 
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 45685db228a..d3c679a65d5 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,7 +1,5 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
-from typing import overload
-from .matcher import Matcher
-from ..vocab import Vocab
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 6e3c52924fa..107d7d926ee 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,15 +1,17 @@
 # cython: infer_types=True, profile=True
-from typing import List
 from collections import defaultdict
+from typing import List
+
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings
 
-from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
 
 from ..attrs import IDS
 
+from ..structs cimport TokenC
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index db960fbd0a9..987eb6733d3 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -14,21 +14,9 @@
 )
 from thinc.types import Floats2d
 
-from ...util import registry
-from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate
-from ...vocab import Vocab
-from ...tokens import Doc, Span, SpanGroup
-from ..extract_spans import extract_spans
 from ...errors import Errors
-from ...kb import (
-    Candidate,
-    InMemoryLookupKB,
-    KnowledgeBase,
-    get_candidates,
-    get_candidates_batch,
-)
-from ...tokens import Doc, Span
+from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
+from ...tokens import Doc, Span, SpanGroup
 from ...util import registry
 from ...vocab import Vocab
 from ..extract_spans import extract_spans
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 01312983d86..422abf4e260 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,13 @@
-from typing import Optional, List, Tuple, Any, Literal
-from thinc.types import Floats2d
-from thinc.api import Model
 import warnings
+from typing import Any, List, Literal, Optional, Tuple
+
+from thinc.api import Model
+from thinc.types import Floats2d
 
 from ...errors import Errors, Warnings
+from ...tokens.doc import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
-from ...tokens.doc import Doc
 
 TransitionSystem = Any  # TODO
 State = Any  # TODO
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index a605d32cd40..61bc7291e2e 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,8 +22,6 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 1a1b0a0fffd..3b9a9ce2dd1 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,4 +1,3 @@
-import warnings
 from typing import Callable, List, Optional, Sequence, Tuple, cast
 
 from thinc.api import Model, Ops, registry
@@ -6,10 +5,9 @@
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from thinc.util import partial
 
-from ..attrs import ORTH
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..tokens import Doc
-from ..vectors import Mode, Vectors
+from ..vectors import Mode
 from ..vocab import Vocab
 
 
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 9b2114900d3..fd0af12ceab 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,28 +1,45 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import List, Tuple, Any, Optional, TypeVar, cast
-from libc.string cimport memset, memcpy
+from typing import Any, List, Optional, Tuple, TypeVar, cast
+
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
+
 import numpy
+
 cimport numpy as np
-from thinc.api import Model, normal_init, chain, list2array, Linear
-from thinc.api import uniform_init, glorot_uniform_init, zero_init
-from thinc.api import NumpyOps
+
+from thinc.api import (
+    Linear,
+    Model,
+    NumpyOps,
+    chain,
+    glorot_uniform_init,
+    list2array,
+    normal_init,
+    uniform_init,
+    zero_init,
+)
+
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
-from thinc.types import Ints1d, Ints2d
+
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
 from ..pipeline._parser_internals.batch import GreedyBatch
+
 from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
-from ..pipeline._parser_internals.transition_system cimport TransitionSystem
 from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..pipeline._parser_internals.transition_system cimport (
+    TransitionSystem,
+    c_apply_actions,
+    c_transition_batch,
+)
+
 from ..tokens.doc import Doc
 from ..util import registry
 
-
 State = Any  # TODO
 
 
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 494088879b1..5138d353cf0 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 7ee621056f1..d75c1071941 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,14 +1,14 @@
 # cython: infer_types
-# cython: profile=False
 import warnings
-from typing import Union, Tuple, List, Dict, Optional
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy
+
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
-from .errors import Warnings
 from . import symbols
 from .errors import Warnings
-from .parts_of_speech import IDS as POS_IDS
 
 
 cdef class Morphology:
diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py
index 89f2861ceac..1e307b66cb9 100644
--- a/spacy/pipeline/_edit_tree_internals/schemas.py
+++ b/spacy/pipeline/_edit_tree_internals/schemas.py
@@ -1,12 +1,8 @@
 from collections import defaultdict
 from typing import Any, Dict, List, Union
 
-try:
-    from pydantic.v1 import BaseModel, Field, ValidationError
-    from pydantic.v1.types import StrictBool, StrictInt, StrictStr
-except ImportError:
-    from pydantic import BaseModel, Field, ValidationError  # type: ignore
-    from pydantic.types import StrictBool, StrictInt, StrictStr  # type: ignore
+from pydantic import BaseModel, Field, ValidationError
+from pydantic.types import StrictBool, StrictInt, StrictStr
 
 
 class MatchNodeSchema(BaseModel):
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 571f246b1e3..5a452e56a88 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,5 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
+
 # These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index c86de231d09..7098b822ef0 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,14 +1,21 @@
 # cython: infer_types=True
+# cython: profile=True
+cimport numpy as np
+
 import numpy
-from cpython.ref cimport PyObject, Py_XDECREF
 
-from ...typedefs cimport class_t
+from cpython.ref cimport Py_XDECREF, PyObject
+
+from ...typedefs cimport class_t, hash_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+
 from .batch cimport Batch
 from .search cimport Beam, MaxViolation
+
 from .search import MaxViolation
+
 from .stateclass cimport StateC, StateClass
 
 
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1b6b25e5f16..1c61ac271d8 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,5 +1,4 @@
 cimport libcpp
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -8,7 +7,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 673e36bf5ac..08f60b2634b 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -1,4 +1,4 @@
-# cython: cdivision=True, infer_types=True
+# cython: profile=True, cdivision=True, infer_types=True
 from cymem.cymem cimport Address, Pool
 from libc.stdint cimport int32_t
 from libcpp.vector cimport vector
@@ -9,7 +9,7 @@ from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...tokens.token cimport MISSING_DEP
-from ...typedefs cimport attr_t
+from ...typedefs cimport attr_t, hash_t
 
 from ...training import split_bilu_label
 
@@ -18,6 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
+
 from .search cimport Beam
 
 
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index cf19c834ed9..5c31ff5c21d 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,10 +1,10 @@
 import os
 import random
+
+from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 
 from collections import Counter
 
@@ -14,16 +14,15 @@ from ...tokens.span import Span
 
 from ...attrs cimport IS_SPACE
 from ...lexeme cimport Lexeme
-from ...structs cimport SpanC
+from ...structs cimport SpanC, TokenC
 from ...tokens.span cimport Span
 from ...typedefs cimport attr_t, weight_t
 
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
-from .search cimport Beam
-from .stateclass cimport StateClass
 from ._state cimport StateC
+from .search cimport Beam
 from .stateclass cimport StateClass
 from .transition_system cimport Transition, do_func_t
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index dfe30e1c130..4626496335a 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -1,12 +1,10 @@
 from cymem.cymem cimport Pool
-
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.pair cimport pair
 from libcpp.queue cimport priority_queue
 from libcpp.vector cimport vector
 
-from ...typedefs cimport class_t, weight_t, hash_t
+from ...typedefs cimport class_t, hash_t, weight_t
 
 ctypedef pair[weight_t, size_t] Entry
 ctypedef priority_queue[Entry] Queue
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 1d9b6dd7adf..251eaa805cb 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,7 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.string cimport memset, memcpy
-from libc.math cimport log, exp
+from libc.math cimport exp, log
+from libc.string cimport memcpy, memset
+
 import math
 
 from cymem.cymem cimport Pool
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index e49ff63c48b..c2a0d22956a 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True
-# cython: profile=False
+import numpy
+
 from libcpp.vector cimport vector
 
 from ...tokens.doc cimport Doc
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index d1340d68c62..a433ce7dc75 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -11,9 +11,11 @@ from collections import Counter
 import srsly
 
 from ...structs cimport TokenC
+from ...tokens.doc cimport Doc
 from ...typedefs cimport attr_t, weight_t
-from .stateclass cimport StateClass
+from . cimport _beam_utils
 from ._parser_utils cimport arg_max_if_valid
+from .stateclass cimport StateClass
 
 from ... import util
 from ...errors import Errors
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 126a48945bc..76f82b84e38 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -11,7 +11,7 @@
 from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
-from ..vocab import Vocab
+from ..training import Example
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py
index 370a698c25a..b4961487b83 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.py
@@ -1,23 +1,19 @@
 # cython: infer_types=True, binding=True
 from collections import defaultdict
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.arc_eager import ArcEager
-
-from ._parser_internals.arc_eager cimport ArcEager
-from .transition_parser cimport Parser
-
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
 from ..util import registry
 from ._parser_internals import nonproj
+from ._parser_internals.arc_eager import ArcEager
 from ._parser_internals.nonproj import DELIMITER
+from ._parser_internals.transition_system import TransitionSystem
 from .functions import merge_subtokens
+from .transition_parser import Parser
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index a1bcb98455c..046ef19c3d5 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,12 +1,12 @@
 from collections import Counter
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model
-from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -19,10 +19,6 @@
 from .lemmatizer import lemmatizer_score
 from .trainable_pipe import TrainablePipe
 
-# The cutoff value of *top_k* above which an alternative method is used to process guesses.
-TOP_K_GUARDRAIL = 20
-
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 4882ead1d92..287f96d9b97 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,45 +1,27 @@
-import warnings
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
-from numpy import dtype
-from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
-from pathlib import Path
-from itertools import islice
-import srsly
 import random
+import warnings
 from itertools import islice
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union, cast
 
 import srsly
+from numpy import dtype
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats2d
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 
-from ..kb import KnowledgeBase, Candidate
-from ..tokens import Doc, Span
-from ..ml import empty_kb
-from ..tokens import Doc, Span, SpanGroup
-from .pipe import deserialize_config
-from .trainable_pipe import TrainablePipe
-from ..language import Language
-from ..vocab import Vocab
-from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors, Warnings
-from ..util import SimpleFrozenList, registry
 from .. import util
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
 from ..ml import empty_kb
 from ..scorer import Scorer
-from ..tokens import Doc, Span
+from ..tokens import Doc, Span, SpanGroup
 from ..training import Example, validate_examples, validate_get_examples
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
-from .legacy.entity_linker import EntityLinker_v1
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
-
 ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 5e7d0720a40..7259fc02699 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,32 +1,30 @@
 # cython: infer_types=True, profile=True, binding=True
+from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
+
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
-from itertools import islice
-from typing import Callable, Dict, Optional, Union
-
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
-from ..parts_of_speech import IDS as POS_IDS
-from ..symbols import POS
-from ..language import Language
-from ..errors import Errors
-from .pipe import deserialize_config
-from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..parts_of_speech import IDS as POS_IDS
 from ..scorer import Scorer
+from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 2c5fd89cc5d..1c7cf151385 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -1,25 +1,16 @@
 # cython: infer_types=True, binding=True
 from collections import defaultdict
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.ner import BiluoPushDown
-from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
-from ..util import registry
-from ..training import remove_bilu_prefix
-
-from ._parser_internals.ner cimport BiluoPushDown
-from .transition_parser cimport Parser
-
 from ..language import Language
-from ..scorer import get_ner_prf
-from ..training import remove_bilu_prefix
+from ..scorer import PRFScore, get_ner_prf
+from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
+from ._parser_internals.ner import BiluoPushDown
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index af7cd09f171..7bc6735a802 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,6 +1,6 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
 import warnings
-from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
 
 import srsly
 
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 02b92e87812..6dd62ed8577 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
 from typing import Callable, List, Optional
 
 import srsly
@@ -7,9 +7,11 @@ from ..tokens.doc cimport Doc
 
 from .. import util
 from ..language import Language
+from ..scorer import Scorer
 from .pipe import Pipe
 from .senter import senter_score
 
+
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index ba45df28400..42615e194e0 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,25 +1,21 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Union
 
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .tagger import ActivationsT, Tagger
-from ..language import Language
+from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
-
+from .tagger import ActivationsT, Tagger
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 4875c5e4bff..cd8fea36b47 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,20 +17,12 @@
 
 import srsly
 
-from .pipe import Pipe
-from ..training import Example
-from ..language import Language
-from ..errors import Errors, Warnings
-from ..util import ensure_path, SimpleFrozenList, registry
-from ..tokens import Doc, Span
-from ..scorer import Scorer, get_ner_prf
-from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
 from ..matcher import Matcher, PhraseMatcher
 from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import Scorer
+from ..scorer import Scorer, get_ner_prf
 from ..tokens import Doc, Span
 from ..training import Example
 from ..util import SimpleFrozenList, ensure_path, registry
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 5c450f36a33..72fd78f461e 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,8 +1,18 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union, Protocol, runtime_checkable
-from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
-from thinc.api import Optimizer
-from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
+from dataclasses import dataclass
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    cast,
+    runtime_checkable,
+)
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 8740058174a..f3d0527ea0b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,29 +1,29 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Callable, Dict, Iterable, List, Optional, Union
-from typing import Tuple
-import numpy
-import srsly
-from thinc.api import Model, set_dropout_rate, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
+import srsly
+from thinc.api import Config, Model, set_dropout_rate
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
+from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
+from ..vocab cimport Vocab
 
 from .. import util
-from ..errors import Errors
+from ..attrs import ID, POS
+from ..errors import Errors, Warnings
 from ..language import Language
+from ..parts_of_speech import X
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
+from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 default_model_config = """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 6cb33109891..13841dd7bbb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,9 +1,5 @@
-from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
-from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
-from thinc.types import Floats2d
-import numpy
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
 from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 9ed9770086c..309b9a84443 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,9 +1,5 @@
-from typing import Iterable, Optional, Dict, List, Callable, Any, Union
-from thinc.types import Floats2d
-from thinc.api import Model, Config
-
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 from thinc.api import Config, Model
 from thinc.types import Floats2d
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index f168aee2ec4..92aec22b7a7 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,10 +1,8 @@
-from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
-from thinc.api import Model, set_dropout_rate, Optimizer, Config
-from thinc.types import Floats2d
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
 
 from thinc.api import Config, Model, Optimizer, set_dropout_rate
+from thinc.types import Floats2d
 
 from ..errors import Errors
 from ..language import Language
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 97442a1aa97..e7cf566a113 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,19 +1,16 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import set_dropout_rate, Model, Optimizer
-import warnings
+from thinc.api import Model, Optimizer, set_dropout_rate
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples, validate_distillation_examples
-from ..errors import Errors, Warnings
-from .pipe import Pipe, deserialize_config
 from .. import util
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..language import Language
-from ..training import Example, validate_examples
+from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
 from .pipe import Pipe, deserialize_config
 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index ef2e3314e85..d521aeced7f 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,49 +1,61 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
 from cymem.cymem cimport Pool
 
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memset
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
 
-import random
 import contextlib
+import random
+import warnings
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops, get_array_module
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
-from thinc.api import CupyOps, NumpyOps, set_dropout_rate
+from thinc.api import (
+    CupyOps,
+    NumpyOps,
+    Optimizer,
+    chain,
+    get_array_module,
+    get_ops,
+    set_dropout_rate,
+    softmax_activation,
+    use_ops,
+)
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
-from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.search cimport Beam
+
 from ..tokens.doc cimport Doc
-from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateC, StateClass
+from .trainable_pipe cimport TrainablePipe
+
 from ._parser_internals import _beam_utils
+
+from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from ..typedefs cimport weight_t
 
-from ..training import validate_examples, validate_get_examples
-from ..training import validate_distillation_examples
-from ..errors import Errors, Warnings
 from .. import util
-from ..errors import Errors
-from ..training import validate_examples, validate_get_examples
-from ._parser_internals import _beam_utils
+from ..errors import Errors, Warnings
+from ..training import (
+    validate_distillation_examples,
+    validate_examples,
+    validate_get_examples,
+)
+
 
 # TODO: Remove when we switch to Cython 3.
 cdef extern from "<algorithm>" namespace "std" nogil:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 7fc5ec20e51..4372e3f5e2e 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,12 +1,3 @@
-from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
-from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
-from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
-from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError, Model
-from thinc.config import Promise
-from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
@@ -18,6 +9,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Type,
@@ -25,34 +17,19 @@
     Union,
 )
 
-try:
-    from pydantic.v1 import (
-        BaseModel,
-        ConstrainedStr,
-        Field,
-        StrictBool,
-        StrictFloat,
-        StrictInt,
-        StrictStr,
-        ValidationError,
-        create_model,
-        validator,
-    )
-    from pydantic.v1.main import ModelMetaclass
-except ImportError:
-    from pydantic import (  # type: ignore
-        BaseModel,
-        ConstrainedStr,
-        Field,
-        StrictBool,
-        StrictFloat,
-        StrictInt,
-        StrictStr,
-        ValidationError,
-        create_model,
-        validator,
-    )
-    from pydantic.main import ModelMetaclass  # type: ignore
+from pydantic import (
+    BaseModel,
+    ConstrainedStr,
+    Field,
+    StrictBool,
+    StrictFloat,
+    StrictInt,
+    StrictStr,
+    ValidationError,
+    create_model,
+    validator,
+)
+from pydantic.main import ModelMetaclass
 from thinc.api import ConfigValidationError, Model, Optimizer
 from thinc.config import Promise
 
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index b734a707c54..c05731c9a15 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,8 +1,5 @@
-from libc.stdint cimport int64_t, uint32_t
-from libcpp.vector cimport vector
-from libcpp.set cimport set
 from cymem.cymem cimport Pool
-from libc.stdint cimport int64_t
+from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
 from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 393661f591d..98224fcd449 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,6 +1,5 @@
-from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
-from typing import Any, Iterable, Iterator, Optional, Union, overload
+from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union, overload
 
 class StringStore:
     def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 73e4c46ed46..43826f07c44 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,10 @@
 # cython: infer_types=True
-from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
+from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+
 cimport cython
 from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 import srsly
@@ -14,7 +17,6 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-
 cdef class StringStore:
     """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index fdc9f192c2f..28551f9ee63 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,11 +1,11 @@
-import pytest
-from spacy.util import get_lang_class
 import functools
-from hypothesis import settings
-import inspect
 import importlib
+import inspect
 import sys
 
+import pytest
+from hypothesis import settings
+
 from spacy.util import get_lang_class
 
 # Functionally disable deadline settings for tests
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 0b05ca7c123..cf850a2234d 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,7 +6,6 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index ca5c2ad3959..3ab7de76323 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,7 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+
 # Helper functions
 def _get_tuple(s: Span):
     return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 23fc8164412..0983159b75d 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -1,11 +1,14 @@
 # cython: infer_types=True, binding=True
+from cymem.cymem cimport Pool
+
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
 from spacy.typedefs cimport class_t, weight_t
-from cymem.cymem cimport Pool
 
-from ..conftest import cytest
 import pytest
 
+from ..conftest import cytest
+
+
 cdef struct TestState:
     int length
     int x
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 2c520b7daf6..f0efc3a63fd 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -3,6 +3,7 @@
 
 import pytest
 from numpy.testing import assert_equal
+from thinc.api import fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import ENT_IOB
@@ -16,8 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-from thinc.api import fix_random_seed
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 4c709932bb1..636bb887789 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,17 +1,19 @@
 import itertools
-import pytest
+
 import numpy
+import pytest
 from numpy.testing import assert_equal
-from thinc.api import Adam
+from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.training import Example
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
+from spacy.training import Example
 from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 0f204ead477..7465c844492 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import cast
 import pickle
+from typing import cast
 
 import hypothesis.strategies as st
 import pytest
@@ -10,7 +10,6 @@
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
 from spacy.pipeline.trainable_pipe import TrainablePipe
-from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 170f2215f83..fe7335600b4 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Dict, Any, cast
+from typing import Any, Callable, Dict, Iterable, cast
 
 import pytest
 from numpy.testing import assert_equal
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 6bff3288dc3..520012c5075 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,16 +2,10 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
-from spacy.tokens import Doc, Span
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.pipeline import EntityRecognizer, merge_entities
-from spacy.pipeline import SpanRuler
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import EntityRecognizer, EntityRuler, SpanRuler, merge_entities
+from spacy.pipeline import EntityRecognizer, SpanRuler, merge_entities
 from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc, Span
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 9854b391e60..6dd4114f1cd 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -1,10 +1,5 @@
 import pytest
-
-try:
-    from pydantic.v1 import StrictBool
-except ImportError:
-    from pydantic import StrictBool  # type: ignore
-
+from pydantic import StrictBool
 from thinc.api import ConfigValidationError
 
 from spacy.lang.en import English
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index fffb7b4ed7f..542d14d1516 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -9,7 +10,7 @@
 from spacy.language import Language
 from spacy.morphology import Morphology
 from spacy.pipeline import TrainablePipe
-from spacy.attrs import MORPH
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc
 from spacy.training import Example
 
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index c45dccb0624..9e1382ebd8c 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -1,4 +1,6 @@
 import pytest
+from pydantic import StrictInt, StrictStr
+from thinc.api import ConfigValidationError, Linear, Model
 
 try:
     from pydantic.v1 import StrictInt, StrictStr
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 94285178310..51f943898f1 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_equal
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 5dcc2e70f67..42eb90a1bb1 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,6 +1,7 @@
 import numpy
-from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged, fix_random_seed
+import pytest
+from numpy.testing import assert_almost_equal, assert_array_equal
+from thinc.api import NumpyOps, Ragged, fix_random_seed, get_current_ops
 
 from spacy import util
 from spacy.lang.en import English
@@ -8,7 +9,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import registry, make_tempdir
+from spacy.util import make_tempdir, registry
 
 OPS = get_current_ops()
 
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index b6f94f7f97b..05e814f0733 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -8,7 +9,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from thinc.api import compounding
+from spacy.training import Example
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2383c36bb01..3f2d757eebc 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,5 +1,5 @@
-from typing import cast
 import random
+from typing import cast
 
 import numpy.random
 import pytest
@@ -13,12 +13,16 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import single_label_bow_config
-from spacy.pipeline.textcat import single_label_cnn_config
-from spacy.pipeline.textcat import single_label_default_config
-from spacy.pipeline.textcat_multilabel import multi_label_bow_config
-from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
-from spacy.pipeline.textcat_multilabel import multi_label_default_config
+from spacy.pipeline.textcat import (
+    single_label_bow_config,
+    single_label_cnn_config,
+    single_label_default_config,
+)
+from spacy.pipeline.textcat_multilabel import (
+    multi_label_bow_config,
+    multi_label_cnn_config,
+    multi_label_default_config,
+)
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index eb0dcc1e38c..646ce0f5d48 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,14 +5,25 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
-from spacy.language import Language
-from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
-from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.language import (
+    DEFAULT_CONFIG,
+    DEFAULT_CONFIG_DISTILL_PATH,
+    DEFAULT_CONFIG_PRETRAIN_PATH,
+    Language,
+)
+from spacy.ml.models import (
+    MaxoutWindowEncoder,
+    MultiHashEmbed,
+    build_tb_parser_model,
+    build_Tok2Vec_model,
+)
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
-from spacy.util import load_config, load_config_from_str
-from spacy.util import load_model_from_config, registry
+from spacy.util import (
+    load_config,
+    load_config_from_str,
+    load_model_from_config,
+    registry,
+)
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 39fbbf58217..d5f2f13af4f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,9 +8,14 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import DependencyParser, EntityRecognizer
-from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
-from spacy.pipeline import TrainablePipe
+from spacy.pipeline import (
+    DependencyParser,
+    EntityRecognizer,
+    SentenceRecognizer,
+    Tagger,
+    TextCategorizer,
+    TrainablePipe,
+)
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 7b729d78f21..a47f03e8ab4 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,19 +1,31 @@
 import math
 import os
+import time
 from collections import Counter
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 
+import numpy
 import pytest
 import srsly
 from click import NoSuchOption
 from packaging.specifiers import SpecifierSet
-from thinc.api import Config
+from thinc.api import Config, ConfigValidationError
 
 import spacy
 from spacy import about
-from spacy.cli import download_module, info
-from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
+from spacy.cli import info
+from spacy.cli._util import (
+    download_file,
+    is_subpath_of,
+    load_project_config,
+    parse_config_overrides,
+    string_to_list,
+    substitute_project_variables,
+    upload_file,
+    validate_project_commands,
+    walk_directory,
+)
 from spacy.cli.apply import apply
 from spacy.cli.debug_data import (
     _compile_gold,
@@ -31,6 +43,8 @@
 from spacy.cli.init_config import RECOMMENDATIONS, fill_config, init_config
 from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import _is_permitted_package_name, get_third_party_dependencies
+from spacy.cli.project.remote_storage import RemoteStorage
+from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 1789d60ea4c..32ca639b37d 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -7,7 +7,7 @@
 from typer.testing import CliRunner
 
 from spacy.cli._util import app, get_git_version
-from spacy.tokens import Doc, DocBin, Span
+from spacy.tokens import Doc, DocBin
 
 from .util import make_tempdir, normalize_whitespace
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index e4b06893c93..25352d2bb16 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -4,7 +4,7 @@
 from unittest import mock
 
 import pytest
-from thinc.api import CupyOps, NumpyOps, get_current_ops
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 import spacy
 from spacy.lang.de import German
@@ -13,12 +13,14 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.lang.en import English
-from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error, find_matching_language
-from spacy.util import load_model_from_config
-import spacy
-from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
+from spacy.util import (
+    find_matching_language,
+    ignore_error,
+    load_model_from_config,
+    raise_error,
+    registry,
+)
+from spacy.vocab import Vocab
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 160e2ecf335..c05ef625e11 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,19 @@
 import ctypes
 import os
 from pathlib import Path
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
-from spacy.util import find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+
+import pytest
+from pydantic import ValidationError
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
index fb034accac2..2c2fcef755e 100644
--- a/spacy/tests/test_symbols.py
+++ b/spacy/tests/test_symbols.py
@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.symbols import IDS, NAMES
 
 V3_SYMBOLS = {
diff --git a/spacy/tests/training/test_loop.py b/spacy/tests/training/test_loop.py
index 46d01509504..9140421b46b 100644
--- a/spacy/tests/training/test_loop.py
+++ b/spacy/tests/training/test_loop.py
@@ -1,11 +1,13 @@
 from typing import Callable, Iterable, Iterator
+
 import pytest
+from thinc.api import Config
+
 from spacy import Language
 from spacy.training import Example
 from spacy.training.initialize import init_nlp_student
 from spacy.training.loop import distill, train
 from spacy.util import load_model_from_config, registry
-from thinc.api import Config
 
 
 @pytest.fixture
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index ef20ec365c6..e8a19947606 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,10 +8,17 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
-from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
-from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
-from spacy.training.alignment_array import AlignmentArray
+from spacy.training import (
+    Alignment,
+    Corpus,
+    Example,
+    biluo_tags_to_offsets,
+    biluo_tags_to_spans,
+    docs_to_json,
+    iob_to_biluo,
+    offsets_to_biluo_tags,
+    validate_distillation_examples,
+)
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 58d30c3202f..b2e50969462 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,12 +2,7 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
-from .typedefs cimport hash_t
-from .structs cimport LexemeC, SpanC, TokenC
-from .tokens.doc cimport Doc
-from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
-from .strings cimport StringStore
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc
 from .typedefs cimport hash_t
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 12a78d39fc4..94397b22d9d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,4 +1,4 @@
-# cython: embedsignature=True, binding=True
+# cython: embedsignature=True, profile=True, binding=True
 cimport cython
 from cymem.cymem cimport Pool
 from cython.operator cimport dereference as deref
@@ -9,17 +9,11 @@ from preshed.maps cimport PreshMap
 
 import re
 
-from .tokens.doc cimport Doc
-from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
-from .attrs import intify_attrs
-from .symbols import ORTH, NORM
-from .errors import Errors
 from . import util
-from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index e5a244360e3..7617e462fde 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -1,9 +1,9 @@
 from ._serialize import DocBin
 from .doc import Doc
+from .doc_bin import DocBin
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .doc_bin import DocBin
-from .morphanalysis import MorphAnalysis
+from .token import Token
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 2b39d5baa28..dc7c0143029 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,7 +8,6 @@ from typing import (
     List,
     Optional,
     Protocol,
-    Sequence,
     Tuple,
     Union,
     overload,
@@ -17,20 +16,15 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-from .span import Span
-from .token import Token
-from .span_groups import SpanGroups
-from .retokenizer import Retokenizer
+
 from ..lexeme import Lexeme
 from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from ._retokenize import Retokenizer
+from .retokenizer import Retokenizer
 from .span import Span
+from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore
 
-DOCBIN_ALL_ATTRS: Tuple[str, ...]
-
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 893ba9c2cda..a2501003bb8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,15 +20,8 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-from .span_groups import SpanGroups
-from .token cimport Token
-from ..lexeme cimport Lexeme, EMPTY_LEXEME
-from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport attr_id_t
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
-from ._dict_proxies import SpanGroups
+from .span_groups import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -42,7 +35,6 @@ from ..attrs cimport (
     LENGTH,
     MORPH,
     NORM,
-    ORTH,
     POS,
     SENT_START,
     SPACY,
@@ -50,22 +42,17 @@ from ..attrs cimport (
     attr_id_t,
 )
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
-from ..typedefs cimport attr_t
+from ..typedefs cimport attr_t, flags_t
 from .token cimport Token
 
 from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
-from ..compat import copy_reg
+from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
-from .. import util
-from .. import parts_of_speech
-from .. import schemas
-from .underscore import Underscore, get_ext_args
-from .retokenizer import Retokenizer
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
-from ._retokenize import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
+from .retokenizer import Retokenizer
 from .underscore import Underscore, get_ext_args
 
 DEF PADDING = 5
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 8a08864d46e..4dda40a05ee 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,7 +10,9 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import ensure_path, SimpleFrozenList
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab import Vocab
+from .doc import Doc
 from .span_groups import SpanGroups
 
 # fmt: off
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 22ce18181a7..7ded04500a3 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -1,10 +1,9 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
-# cython: profile=False
 from typing import Generator, List, Tuple
 
 cimport cython
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t
+from libc.stdint cimport int32_t, int64_t
 from libcpp.pair cimport pair
 from libcpp.unordered_map cimport unordered_map
 from libcpp.unordered_set cimport unordered_set
@@ -12,12 +11,13 @@ from libcpp.unordered_set cimport unordered_set
 import weakref
 
 from murmurhash.mrmr cimport hash64
+from preshed.maps cimport map_get_unless_missing
 
 from .. import Errors
 
-from ..typedefs cimport hash_t
 from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
+from ..typedefs cimport hash_t
 
 from .token import Token
 
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index f866488ecc2..73922c62b9b 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,9 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
-from ..morphology cimport MorphAnalysisC
 from libcpp.memory cimport shared_ptr
 
+from ..morphology cimport MorphAnalysisC
+from ..typedefs cimport hash_t
+from ..vocab cimport Vocab
+
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index ceaa3ecd04e..014c01a2f74 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,17 +1,15 @@
-# cython: profile=False
 cimport numpy as np
 from libc.string cimport memset
 
 from ..errors import Errors
 from ..morphology import Morphology
 
-from ..morphology cimport check_feature, get_by_field, list_features
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr
+
+from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
-from libcpp.memory cimport shared_ptr
-from cython.operator cimport dereference as deref
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index c0052ca9a9a..7b6501d4442 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -1,6 +1,7 @@
-# cython: infer_types=True, bounds_check=False
+# cython: infer_types=True, bounds_check=False, profile=True
 from cymem.cymem cimport Pool
-from libc.string cimport memset
+from libc.stdlib cimport free, malloc
+from libc.string cimport memcpy, memset
 
 import numpy
 from thinc.api import get_array_module
@@ -9,12 +10,15 @@ from ..attrs cimport MORPH, NORM
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
 from ..structs cimport LexemeC, TokenC
 from ..vocab cimport Vocab
-from .doc cimport Doc, set_children_from_heads, token_by_start
+from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
 from .span cimport Span
 from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
+from ..util import SimpleFrozenDict
+from .underscore import is_writable_attr
+
 from ..strings cimport get_string_id
 
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index ce318ed0dfb..fb592e68bd8 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,5 +1,5 @@
-from libcpp.memory cimport shared_ptr
 cimport numpy as np
+from libcpp.memory cimport shared_ptr
 
 from ..structs cimport SpanC
 from ..typedefs cimport attr_t
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 6b7782b788b..c574d86372c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -3,17 +3,20 @@ cimport numpy as np
 from libc.math cimport sqrt
 from libcpp.memory cimport make_shared
 
+import copy
+import warnings
+
 import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport *
-from ..attrs cimport ORTH, attr_id_t
+from ..attrs cimport attr_id_t
 from ..lexeme cimport Lexeme
-from ..structs cimport TokenC
+from ..parts_of_speech cimport univ_pos_t
+from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr
-from .token cimport Token
+from ..typedefs cimport attr_t, flags_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
 
 from ..errors import Errors, Warnings
 from ..util import normalize_slice
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 8a524926a03..bc5bb92d38c 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -1,17 +1,16 @@
-# cython: profile=False
 import struct
 import weakref
 from copy import deepcopy
-from typing import Iterable, Optional, Union
+from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
 
 import srsly
 
 from spacy.errors import Errors
 
-from .span cimport Span
-from libc.stdint cimport uint64_t, uint32_t, int32_t
 from libcpp.memory cimport make_shared
 
+from .span cimport Span
+
 
 cdef class SpanGroup:
     """A group of spans that all belong to the same Doc object. The group
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 6c4806ff9cb..7e9c1ef4b50 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -2,11 +2,13 @@
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
+from cython.view cimport array as cvarray
 
 np.import_array()
 
 import warnings
 
+import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport (
@@ -27,7 +29,6 @@ from ..attrs cimport (
     LIKE_EMAIL,
     LIKE_NUM,
     LIKE_URL,
-    ORTH,
 )
 from ..lexeme cimport Lexeme
 from ..symbols cimport conj
@@ -39,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+
 from cython.operator cimport dereference as deref
 
 
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 358b2bd806d..adfc2bb6658 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,11 +1,9 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
-from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .callbacks import create_copy_from_base_model  # noqa: F401
 from .corpus import Corpus, JsonlCorpus, PlainTextCorpus  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .iob_utils import (  # noqa: F401
@@ -19,28 +17,3 @@
     tags_to_entities,
 )
 from .loggers import console_logger  # noqa: F401
-
-__all__ = [
-    "Alignment",
-    "Corpus",
-    "Example",
-    "JsonlCorpus",
-    "PlainTextCorpus",
-    "biluo_tags_to_offsets",
-    "biluo_tags_to_spans",
-    "biluo_to_iob",
-    "create_copy_from_base_model",
-    "docs_to_json",
-    "dont_augment",
-    "iob_to_biluo",
-    "minibatch_by_padded_size",
-    "minibatch_by_words",
-    "offsets_to_biluo_tags",
-    "orth_variants_augmenter",
-    "read_json_file",
-    "remove_bilu_prefix",
-    "split_bilu_label",
-    "tags_to_entities",
-    "validate_get_examples",
-    "validate_examples",
-]
diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx
index c68110e304f..79fec73c411 100644
--- a/spacy/training/align.pyx
+++ b/spacy/training/align.pyx
@@ -1,4 +1,3 @@
-# cython: profile=False
 import re
 from itertools import chain
 from typing import List, Tuple
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 469bb263016..21f1b29f5a2 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,4 +1,17 @@
 import itertools
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    TypeVar,
+    Union,
+)
+
 from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 21c3d56a118..c2f3b8b51fa 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
 
 from ..errors import Errors
+from ..language import Language
 from ..util import load_model, logger, registry
 
-if TYPE_CHECKING:
-    from ..language import Language
-
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index 1ff7a64e09d..a78c39aea7b 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,9 +1,13 @@
 import srsly
-from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations_to_doc
-from ..example import _fix_legacy_dict_data, _parse_example_dict_data
-from ...util import load_model
+
 from ...lang.mul import MultiLanguage
+from ...util import load_model
+from ..example import (
+    _fix_legacy_dict_data,
+    _parse_example_dict_data,
+    annotations_to_doc,
+)
+from ..gold_io import json_iterate, json_to_annotations
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index b2c93f24bfa..914e877f579 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,5 +1,6 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index afbdf463110..a42e8f6425b 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -1,4 +1,4 @@
-# cython: profile=False
+import json
 import warnings
 
 import srsly
@@ -6,7 +6,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 
 
 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 61ad1c09cc0..781614c34d0 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,9 +1,3 @@
-from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
-from thinc.api import Config, ConfigValidationError
-from pathlib import Path
-import srsly
-import numpy
-import tarfile
 import gzip
 import tarfile
 import warnings
@@ -15,14 +9,27 @@
 import numpy
 import srsly
 import tqdm
-from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
+from thinc.api import Config, ConfigValidationError
 
 from ..errors import Errors, Warnings
+from ..lookups import Lookups
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
-from ..util import registry, load_model_from_config, resolve_dot_names, logger
-from ..util import load_model, ensure_path, get_sourced_components
-from ..util import OOV_RANK, DEFAULT_OOV_PROB
-from ..util import set_gpu_allocator_from_config, set_seed_from_config
+from ..util import (
+    DEFAULT_OOV_PROB,
+    OOV_RANK,
+    ensure_path,
+    get_sourced_components,
+    load_model,
+    load_model_from_config,
+    logger,
+    registry,
+    resolve_dot_names,
+    set_gpu_allocator_from_config,
+    set_seed_from_config,
+)
+from ..vectors import Mode as VectorsMode
+from ..vectors import Vectors
+from .pretrain import get_tok2vec_ref
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index ad162678fec..63715ec2c42 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,20 +3,34 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from thinc.api import Optimizer, Config, constant
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from thinc.api import Config, Optimizer, constant
 from wasabi import Printer
-import random
-import sys
-import shutil
 
-
-from .example import Example
-from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from .. import ty
 from ..errors import Errors
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
 from ..tokens.doc import Doc
-from .. import ty
-from ..util import resolve_dot_names, registry, logger
-from ..util import set_gpu_allocator_from_config, set_seed_from_config
+from ..util import (
+    logger,
+    registry,
+    resolve_dot_names,
+    set_gpu_allocator_from_config,
+    set_seed_from_config,
+)
+from .example import Example
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
diff --git a/spacy/ty.py b/spacy/ty.py
index ac09cb336ac..e4f34a5f651 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,5 +1,17 @@
-from typing import TYPE_CHECKING, Protocol, runtime_checkable
-from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    runtime_checkable,
+)
+
+from thinc.api import Model, Optimizer
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index 3bb92e7334c..ae9837e3afe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,13 +2,7 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model, constant as constant_schedule
-from thinc.api import fix_random_seed, set_gpu_allocator
-import functools
+import inspect
 import itertools
 import logging
 import os
@@ -55,15 +49,9 @@
 from packaging.requirements import Requirement
 from packaging.specifiers import InvalidSpecifier, SpecifierSet
 from packaging.version import InvalidVersion, Version
-from thinc.api import (
-    Adam,
-    Config,
-    ConfigValidationError,
-    Model,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-)
+from thinc.api import Adam, Config, ConfigValidationError, Model, NumpyOps, Optimizer
+from thinc.api import constant as constant_schedule
+from thinc.api import fix_random_seed, get_current_ops, set_gpu_allocator
 
 try:
     import cupy.random
@@ -71,12 +59,9 @@
     cupy = None
 
 
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
-from .errors import Errors, Warnings
 from . import about
-from .compat import CudaStream, cupy, importlib_metadata, is_windows
-from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
+from .compat import CudaStream, cupy, is_windows
+from .errors import Errors, Warnings
 from .symbols import ORTH
 
 if TYPE_CHECKING:
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index e16efd2738d..876c56bed1d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,15 +1,13 @@
-# cython: infer_types=True, binding=True
-from typing import Callable
-
+cimport numpy as np
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64
 
+import functools
 import warnings
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Union, cast
+from typing import cast
 
 import numpy
 import srsly
@@ -21,13 +19,9 @@ from .attrs cimport ORTH, attr_id_t
 from .strings cimport StringStore
 
 from . import util
-from .attrs import IDS
 from .errors import Errors, Warnings
 from .strings import get_string_id
 
-if TYPE_CHECKING:
-    from .vocab import Vocab  # noqa: F401  # no-cython-lint
-
 
 def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3ccfa6db622..3ff7e3d69c4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,5 +1,7 @@
 import functools
 
+import functools
+
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
@@ -16,6 +18,7 @@ from .errors import Errors
 from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
 from .lang.norm_exceptions import BASE_NORMS
 from .lookups import Lookups
+from .util import registry
 from .vectors import Mode as VectorsMode
 from .vectors import Vectors
 

From d171288e58c859abb71bb34c61c6c0c5c464f96a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:21 +0200
Subject: [PATCH 074/504] Fix span <-> underscore import cycle

---
 spacy/tokens/underscore.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 63706851286..c3e3641d454 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
-from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
+    from .span import Span
     from .token import Token
 
 
@@ -40,7 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
-        if type(obj) == Span:
+        # We used to check if obj is a span, however, this introduces an
+        # import cycle between the span and underscore modeles. So we
+        # do a structural type check instead.
+        if hasattr(obj, "id") and hasattr(obj, "label") and hasattr(obj, "kb_id"):
             object.__setattr__(self, "_label", label)
             object.__setattr__(self, "_kb_id", kb_id)
             object.__setattr__(self, "_span_id", span_id)

From d1dac2b16636d92c0310c12a69251684626e9dfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:45 +0200
Subject: [PATCH 075/504] Fix training.callbacks <-> language import cycle

---
 spacy/training/callbacks.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index c2f3b8b51fa..21c3d56a118 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,9 +1,11 @@
-from typing import Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional
 
 from ..errors import Errors
-from ..language import Language
 from ..util import load_model, logger, registry
 
+if TYPE_CHECKING:
+    from ..language import Language
+
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(

From 1c04cac03269db6ec29c35786a0e62032573d39f Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 6 Jul 2023 15:20:13 +0200
Subject: [PATCH 076/504] Disallow False for first/last arguments of add_pipe
 (#12793)

* Literal True for first/last options

* add test case

* update docs

* remove old redundant test case

* black formatting

* use Optional typing in docstrings

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/errors.py                           |  1 +
 spacy/language.py                         | 20 ++++++++++++--------
 spacy/tests/pipeline/test_pipe_methods.py | 18 ++++++++++++++++--
 website/docs/api/language.mdx             |  7 ++++---
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4909371d549..2ddaef19bca 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -989,6 +989,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index ea641224684..5b2652db53b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -763,8 +763,8 @@ def add_pipe(
         *,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
         source: Optional["Language"] = None,
         config: Dict[str, Any] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
@@ -783,8 +783,8 @@ def add_pipe(
             component directly before.
         after (Union[str, int]): Name or index of the component to insert new
             component directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         source (Language): Optional loaded nlp object to copy the pipeline
             component from.
         config (Dict[str, Any]): Config parameters to use for this component.
@@ -830,18 +830,22 @@ def _get_pipe_index(
         self,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
     ) -> int:
         """Determine where to insert a pipeline component based on the before/
         after/first/last values.
 
         before (str): Name or index of the component to insert directly before.
         after (str): Name or index of component to insert directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         RETURNS (int): The index of the new pipeline component.
         """
+        if first is not None and first is not True:
+            raise ValueError(Errors.E4009.format(attr="first", value=first))
+        if last is not None and last is not True:
+            raise ValueError(Errors.E4009.format(attr="last", value=last))
         all_args = {"before": before, "after": after, "first": first, "last": last}
         if sum(arg is not None for arg in [before, after, first, last]) >= 2:
             raise ValueError(
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 39611a74278..063e5bf67fd 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -189,6 +189,22 @@ def test_add_pipe_last(nlp, name1, name2):
     assert nlp.pipeline[-1][0] == name1
 
 
+@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")])
+def test_add_pipe_false(nlp, name1, name2):
+    Language.component("new_pipe2", func=lambda doc: doc)
+    nlp.add_pipe("new_pipe2", name=name2)
+    with pytest.raises(
+        ValueError,
+        match="The 'last' parameter should be 'None' or 'True', but found 'False'",
+    ):
+        nlp.add_pipe("new_pipe", name=name1, last=False)
+    with pytest.raises(
+        ValueError,
+        match="The 'first' parameter should be 'None' or 'True', but found 'False'",
+    ):
+        nlp.add_pipe("new_pipe", name=name1, first=False)
+
+
 def test_cant_add_pipe_first_and_last(nlp):
     with pytest.raises(ValueError):
         nlp.add_pipe("new_pipe", first=True, last=True)
@@ -411,8 +427,6 @@ def test_add_pipe_before_after():
         nlp.add_pipe("entity_ruler", before="ner", after=2)
     with pytest.raises(ValueError):
         nlp.add_pipe("entity_ruler", before=True)
-    with pytest.raises(ValueError):
-        nlp.add_pipe("entity_ruler", first=False)
 
 
 def test_disable_enable_pipes():
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 82cb1c14cef..d65ea376431 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -436,7 +436,8 @@ component factory registered using
 [`@Language.component`](/api/language#component) or
 [`@Language.factory`](/api/language#factory). Components should be callables
 that take a `Doc` object, modify it and return it. Only one of `before`,
-`after`, `first` or `last` can be set. Default behavior is `last=True`.
+`after`, `first` or `last` can be set. The arguments `first` and `last` can
+either be `None` or `True`. Default behavior is `last=True`.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -471,8 +472,8 @@ component, adds it to the pipeline and returns it.
 | _keyword-only_                        |                                                                                                                                                                                                                                                                                          |
 | `before`                              | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~                                                                                                                                                                                               |
 | `after`                               | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~                                                                                                                                                                                                |
-| `first`                               | Insert component first / not first in the pipeline. ~~Optional[bool]~~                                                                                                                                                                                                                   |
-| `last`                                | Insert component last / not last in the pipeline. ~~Optional[bool]~~                                                                                                                                                                                                                     |
+| `first`                               | Insert component first in the pipeline if set to `True`. ~~Optional[Literal[True]]~~                                                                                                                                                                                                     |
+| `last`                                | Insert component last in the pipeline if set to `True`. ~~Optional[Literal[True]]~~                                                                                                                                                                                                      |
 | `config` <Tag variant="new">3</Tag>   | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~                                                                                                                                    |
 | `source` <Tag variant="new">3</Tag>   | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
 | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~                                                                                                                                                           |

From 2fcbc85cceeae4c14b8a6ef1951da8b630f9ebfd Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 16:38:29 +0200
Subject: [PATCH 077/504] merge fixes

---
 .../_parser_internals/_beam_utils.pyx         |  4 +-
 spacy/pipeline/morphologizer.pyx              |  1 -
 spacy/pipeline/transition_parser.pyx          | 27 ++++------
 spacy/tests/pipeline/test_tok2vec.py          | 54 +++++++++++++++++++
 .../tests/serialize/test_serialize_config.py  |  1 +
 spacy/tokens/span.pyx                         |  3 +-
 spacy/tokens/token.pyx                        |  2 +-
 spacy/vectors.pyx                             |  2 +-
 8 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 7098b822ef0..7c546752d80 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -4,9 +4,7 @@ cimport numpy as np
 
 import numpy
 
-from cpython.ref cimport Py_XDECREF, PyObject
-
-from ...typedefs cimport class_t, hash_t
+from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 7259fc02699..765fd83f111 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -19,7 +19,6 @@ from ..scorer import Scorer
 from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .pipe import deserialize_config
 from .tagger import ActivationsT, Tagger
 
 # See #9050
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d521aeced7f..8e4bee2b3dd 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -7,15 +7,9 @@ from typing import Dict, Iterable, List, Optional, Tuple
 cimport numpy as np
 from cymem.cymem cimport Pool
 
-from itertools import islice
-
-from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
 import contextlib
 import random
-import warnings
+from itertools import islice
 
 import numpy
 import numpy.random
@@ -24,29 +18,21 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
-    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
-    softmax_activation,
-    use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
 
 from ..tokens.doc cimport Doc
-from ._parser_internals cimport _beam_utils
-from ._parser_internals.search cimport Beam
-from ._parser_internals.stateclass cimport StateC, StateClass
-from .trainable_pipe cimport TrainablePipe
-
-from ._parser_internals import _beam_utils
-
 from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
+from ._parser_internals cimport _beam_utils
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 from .. import util
 from ..errors import Errors, Warnings
@@ -62,6 +48,11 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
+
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index e557e294112..f6cefbc1f84 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -642,3 +642,57 @@ def tok2vec_distill_wrapper(
 
     student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
     student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
+
+
+def test_tok2vec_listener_source_link_name():
+    """The component's internal name and the tok2vec listener map correspond
+    to the most recently modified pipeline.
+    """
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+
+    # there is no way to have the component have the right name for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+
+    # there is no way to have the tok2vec have the right listener map for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.add_pipe("ner", name="ner3", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
+    nlp2.remove_pipe("ner3")
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.remove_pipe("tagger2")
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+
+    # at this point the tok2vec component corresponds to nlp2
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
+    nlp1.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    # modifying nlp2 syncs it back to nlp2
+    nlp2.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+
+def test_tok2vec_listener_source_replace_listeners():
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
+    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("tagger", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("ner", name="ner2", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 646ce0f5d48..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -18,6 +18,7 @@
     build_Tok2Vec_model,
 )
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.training import Example
 from spacy.util import (
     load_config,
     load_config_from_str,
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c574d86372c..da93550569e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -15,8 +15,9 @@ from ..lexeme cimport Lexeme
 from ..parts_of_speech cimport univ_pos_t
 from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
+from ..typedefs cimport attr_t
 from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from .token cimport Token
 
 from ..errors import Errors, Warnings
 from ..util import normalize_slice
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 7e9c1ef4b50..26e571ee802 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -431,7 +431,7 @@ cdef class Token:
         if "vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["vector"](self)
         else:
-            return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
+            return self.vocab.get_vector(self.c.lex.orth)
 
     @property
     def vector_norm(self):
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 876c56bed1d..111a9d01e08 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -130,7 +130,7 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
         """Create a new vector store.
 
         strings (StringStore): The string store.

From 77d7d644c8cbe8730539beb8b1896c7169ba90d6 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 17:41:29 +0200
Subject: [PATCH 078/504] cython fixes and cleanup

---
 spacy/matcher/phrasematcher.pyx               |  2 -
 spacy/ml/tb_framework.pyx                     | 55 ++++++++++---------
 spacy/morphology.pyx                          |  6 +-
 spacy/parts_of_speech.pxd                     |  2 +-
 spacy/pipeline/_parser_internals/ner.pyx      |  1 -
 spacy/pipeline/_parser_internals/search.pxd   |  1 -
 spacy/pipeline/_parser_internals/search.pyx   | 12 ++--
 .../_parser_internals/transition_system.pxd   |  4 +-
 .../_parser_internals/transition_system.pyx   | 21 ++++---
 spacy/pipeline/morphologizer.pyx              |  3 +-
 spacy/pipeline/pipe.pyx                       |  5 +-
 spacy/pipeline/trainable_pipe.pyx             | 17 +++---
 spacy/pipeline/transition_parser.pyx          | 55 ++++++++++---------
 spacy/strings.pyx                             |  9 +--
 spacy/tests/parser/_search.pyx                | 49 +++++++++--------
 spacy/tokens/doc.pyx                          |  2 +-
 spacy/tokens/morphanalysis.pyx                |  1 -
 spacy/tokens/span.pyx                         |  3 +-
 18 files changed, 119 insertions(+), 129 deletions(-)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 107d7d926ee..d1a8eaf33c4 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -160,7 +160,6 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
@@ -196,7 +195,6 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index fd0af12ceab..ed04045a6a5 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (
 
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
 
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
     Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
     if X is not None:
-        docs, moves = X
+        docs, _ = X
         model.get_ref("tok2vec").initialize(X=docs)
     else:
         model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")
@@ -192,9 +192,10 @@ class TransitionModelInputs:
         self,
         docs: List[Doc],
         moves: TransitionSystem,
-        actions: Optional[List[Ints1d]]=None,
-        max_moves: int=0,
-        states: Optional[List[State]]=None):
+        actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0,
+        states: Optional[List[State]] = None,
+    ):
         """
         actions (Optional[List[Ints1d]]): actions to apply for each Doc.
         docs (List[Doc]): Docs to predict transition sequences for.
@@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
     else:
         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-            feats, backprop_feats, seen_mask, is_train, actions=actions,
-            max_moves=inputs.max_moves)
+                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
+                                 max_moves=inputs.max_moves)
 
 
 def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
     cdef vector[StateC*] c_states
     cdef StateClass state
     for state in states:
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
 
     return (states, scores), backprop
 
+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
     cdef vector[StateC *] unfinished
     cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
@@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                    sizes.states)
+                                   sizes.states)
             else:
                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
             for i in range(sizes.states):
@@ -302,8 +304,8 @@ def _forward_fallback(
     backprop_feats,
     seen_mask,
     is_train: bool,
-    actions: Optional[List[Ints1d]]=None,
-    max_moves: int=0):
+    actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")
@@ -371,7 +373,7 @@ def _forward_fallback(
             for clas in set(model.attrs["unseen_classes"]):
                 if (d_scores[:, clas] < 0).any():
                     model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
         # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
         output.inc_grad("b", d_scores.sum(axis=0))
@@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
@@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
     else:
         # Compute hidden-to-output
         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                      1.0, <const float *>A.hiddens, n.hiddens,
-                      <const float *>W.hidden_weights, n.hiddens,
-                      0.0, scores, n.classes)
+                     1.0, <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens,
+                     0.0, scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                 scores[i*n.classes+j] = min_
 
 
-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
     cdef int F = n.feats
     cdef int T = n.tokens
     padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                 feature = &cached[idx]
             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
         token_ids += F
-
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index d75c1071941..e7f93b78b47 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -80,15 +80,13 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
-
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-        ])
+            ])
         return norm_feats_string or self.EMPTY_MORPH
 
-
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 01f116ea688..22a571be7b0 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
     ADV = symbols.ADV
     AUX = symbols.AUX
     CONJ = symbols.CONJ
-    CCONJ  = symbols.CCONJ  # U20
+    CCONJ = symbols.CCONJ  # U20
     DET = symbols.DET
     INTJ = symbols.INTJ
     NOUN = symbols.NOUN
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 5c31ff5c21d..3a352f51ff5 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -2,7 +2,6 @@ import os
 import random
 
 from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index 4626496335a..ad68dc5c718 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -57,7 +57,6 @@ cdef class Beam:
     cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
                      void* extra_args) except -1
     cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
- 
 
     cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
         self.scores[i][j] = score
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 251eaa805cb..578299b56ae 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,11 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.math cimport exp, log
-from libc.string cimport memcpy, memset
-
-import math
-
 from cymem.cymem cimport Pool
+from libc.math cimport exp
+from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap
 
 
@@ -70,7 +67,7 @@ cdef class Beam:
             self.costs[i][j] = costs[j]
 
     cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i, j
+        cdef int i
         for i in range(self.width):
             memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
             memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@@ -176,7 +173,6 @@ cdef class Beam:
         beam-width, and n is the number of classes.
         """
         cdef Entry entry
-        cdef weight_t score
         cdef _State* s
         cdef int i, j, move_id
         assert self.size >= 1
@@ -269,7 +265,7 @@ cdef class MaxViolation:
                 # This can happen from non-monotonic actions
                 # If we find a better gold analysis this way, be sure to keep it.
                 elif pred._states[i].loss <= 0 \
-                and tuple(pred.histories[i]) not in seen_golds:
+                        and tuple(pred.histories[i]) not in seen_golds:
                     g_scores.append(pred._states[i].score)
                     g_hist.append(list(pred.histories[i]))
             for i in range(gold.size):
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 66cc7747b69..08baed932ba 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -60,7 +60,7 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil
+                          int batch_size) nogil
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-        int nr_class, int batch_size) nogil
+                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index a433ce7dc75..50b155bf9bb 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -294,19 +294,19 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil:
-        cdef int i
-        cdef Transition action
-        cdef StateC* state
-        for i in range(batch_size):
-            state = states[i]
-            action = moves.c[actions[i]]
-            action.do(state, action.label)
-            state.history.push_back(action.clas)
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
 
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-    int nr_class, int batch_size) nogil:
+                             int nr_class, int batch_size) nogil:
     is_valid = <int*>calloc(moves.n_moves, sizeof(int))
     cdef int i, guess
     cdef Transition action
@@ -322,4 +322,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
             action.do(states[i], action.label)
             states[i].history.push_back(guess)
     free(is_valid)
-
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 765fd83f111..669a5424412 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,8 +1,7 @@
 # cython: infer_types=True, profile=True, binding=True
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, Optional, Union
 
-import srsly
 from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 7bc6735a802..8409e64c3cb 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,12 +1,11 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
-from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
 
 import srsly
 
 from ..tokens.doc cimport Doc
 
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example
 from ..util import raise_error
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index e7cf566a113..065a6c20d62 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,5 +1,4 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
@@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
 from ..tokens.doc cimport Doc
 
 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
@@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-
     def distill(self,
-               teacher_pipe: Optional["TrainablePipe"],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+                teacher_pipe: Optional["TrainablePipe"],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ) -> Dict[str, float]:
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is typically trained on the probability
         distribution of the teacher, but details may differ per pipe.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 8e4bee2b3dd..9fa0d4987b8 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -228,12 +228,13 @@ class Parser(TrainablePipe):
         raise NotImplementedError
 
     def distill(self,
-               teacher_pipe: Optional[TrainablePipe],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None):
+                teacher_pipe: Optional[TrainablePipe],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ):
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is trained on the transition probabilities
         of the teacher.
@@ -283,11 +284,13 @@ class Parser(TrainablePipe):
         # teacher's distributions.
 
         student_inputs = TransitionModelInputs(docs=student_docs,
-            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
+                                               states=[state.copy() for state in states],
+                                               moves=self.moves,
+                                               max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
         actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            states=states, moves=teacher_pipe.moves, actions=actions)
+                                               states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -300,10 +303,9 @@ class Parser(TrainablePipe):
 
         return losses
 
-
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-        normalize: bool=False,
+            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+            normalize: bool = False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -326,9 +328,9 @@ class Parser(TrainablePipe):
         # ourselves.
 
         teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
         student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
 
         assert teacher_scores.shape == student_scores.shape
 
@@ -442,13 +444,15 @@ class Parser(TrainablePipe):
         else:
             init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
-            max_moves=max_moves, states=[state.copy() for state in init_states])
+        inputs = TransitionModelInputs(docs=docs,
+                                       moves=self.moves,
+                                       max_moves=max_moves,
+                                       states=[state.copy() for state in init_states])
         (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
         if sum(s.shape[0] for s in scores) == 0:
             return losses
         d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-            examples, max_moves)
+                                 examples, max_moves)
         backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
@@ -489,9 +493,7 @@ class Parser(TrainablePipe):
         cdef TransitionSystem moves = self.moves
         cdef StateClass state
         cdef int clas
-        cdef int nF = self.model.get_dim("nF")
         cdef int nO = moves.n_moves
-        cdef int nS = sum([len(history) for history in histories])
         cdef Pool mem = Pool()
         cdef np.ndarray costs_i
         is_valid = <int*>mem.alloc(nO, sizeof(int))
@@ -558,8 +560,8 @@ class Parser(TrainablePipe):
 
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0.,
+                    sgd=None, losses=None, beam_density=0.0):
         raise NotImplementedError
 
     def set_output(self, nO):
@@ -684,9 +686,10 @@ class Parser(TrainablePipe):
             return states
 
         # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
-            states=[state.copy() for state in to_cut])
-        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+        teacher_inputs = TransitionModelInputs(docs=docs,
+                                               moves=moves,
+                                               states=[state.copy() for state in to_cut])
+        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
 
         # Step through the teacher's actions and store every state after
         # each multiple of max_length.
@@ -784,6 +787,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
 
     return actions
 
+
 def _states_diff_to_actions(
     before_states: List[StateClass],
     after_states: List[StateClass]
@@ -804,8 +808,9 @@ def _states_diff_to_actions(
         c_state_before = before_state.c
         c_state_after = after_state.c
 
-        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
-            c_state_after.history.begin())
+        assert equal(c_state_before.history.begin(),
+                     c_state_before.history.end(),
+                     c_state_after.history.begin())
 
     actions = []
     while True:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 43826f07c44..28e06a2ecea 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,6 @@
 # cython: infer_types=True
-from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
 
-cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from libcpp.set cimport set
@@ -244,7 +243,6 @@ cdef class StringStore:
         cdef int n_length_bytes
         cdef int i
         cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
-        cdef uint32_t ulength = length
         if length < sizeof(string.s):
             string.s[0] = <unsigned char>length
             memcpy(&string.s[1], chars, length)
@@ -302,7 +300,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
 
     try:
         return hash_string(string_or_hash)
-    except:
+    except:   # no-cython-lint
         if _try_coerce_to_hash(string_or_hash, &str_hash):
             # Coerce the integral key to the expected primitive hash type.
             # This ensures that custom/overloaded "primitive" data types
@@ -319,6 +317,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
         return True
-    except:
+    except:  # no-cython-lint
         return False
-
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 0983159b75d..cd9e6b2f5ee 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -2,7 +2,7 @@
 from cymem.cymem cimport Pool
 
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t, weight_t
+from spacy.typedefs cimport class_t
 
 import pytest
 
@@ -42,32 +42,35 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
     state = <TestState*>state
     mem.free(state)
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width",
-    [
-        (2, 3),
-        (3, 6),
-        (4, 20),
-    ]
-)
+                         [
+                             (2, 3),
+                             (3, 6),
+                             (4, 20),
+                         ]
+                         )
 def test_init(nr_class, beam_width):
     b = Beam(nr_class, beam_width)
     assert b.size == 1
     assert b.width == beam_width
     assert b.nr_class == nr_class
 
+
 @cytest
 def test_init_violn():
     MaxViolation()
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (2, 3, 3),
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (2, 3, 3),
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_initialize(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
@@ -79,11 +82,11 @@ def test_initialize(nr_class, beam_width, length):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length,extra",
-    [
-        (2, 3, 4, None),
-        (3, 6, 15, u"test beam 1"),
-    ]
-)
+                         [
+                             (2, 3, 4, None),
+                             (3, 6, 15, u"test beam 1"),
+                         ]
+                         )
 def test_initialize_extra(nr_class, beam_width, length, extra):
     b = Beam(nr_class, beam_width)
     if extra is None:
@@ -97,11 +100,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_transition(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a2501003bb8..5a70af00e2e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1755,7 +1755,7 @@ cdef class Doc:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index 014c01a2f74..f3841baa24a 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.string cimport memset
 
 from ..errors import Errors
 from ..morphology import Morphology
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index da93550569e..8e490ec83d0 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -242,8 +242,8 @@ cdef class Span:
 
     @property
     def _(self):
-        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
+        cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
                           start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
@@ -950,7 +950,6 @@ cdef class Span:
             self.id_ = ent_id_
 
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From b3e4b924e7e9df9f133f42c95f6219c1a3dd18fb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 20 Jul 2023 09:59:19 +0200
Subject: [PATCH 079/504] Update spacy/ml/tb_framework.pyx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/ml/tb_framework.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index ed04045a6a5..a48c6b901c7 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -305,7 +305,8 @@ def _forward_fallback(
     seen_mask,
     is_train: bool,
     actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0):
+    max_moves: int = 0,
+):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")

From 790d10aaec6fa551789daaad0911c3f747e4c296 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 20 Jul 2023 14:08:29 +0200
Subject: [PATCH 080/504] remove unnecessary line

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/ml/tb_framework.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index a48c6b901c7..6c5c29d8549 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -145,7 +145,6 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")

From 3b7f943173d9d8024f8046ba90d19144aed5cd91 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 15:54:35 +0200
Subject: [PATCH 081/504] Recommend lookups tables from URLs or other loaders
 (#12283)

* Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/about.py                             |  4 ++
 spacy/errors.py                            | 25 ++++++++---
 spacy/language.py                          |  7 ----
 spacy/lookups.py                           | 26 +++++++++++-
 spacy/pipeline/lemmatizer.py               | 21 +++++++++-
 spacy/tests/pipeline/test_lemmatizer.py    | 16 ++++++-
 website/docs/api/lemmatizer.mdx            |  4 +-
 website/docs/api/top-level.mdx             | 49 ++++++++++++++++++++++
 website/docs/usage/index.mdx               |  7 ++--
 website/docs/usage/linguistic-features.mdx |  6 +--
 website/src/widgets/quickstart-install.js  |  4 --
 11 files changed, 141 insertions(+), 28 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index ec1dde7cae6..73f201af5fb 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,3 +3,7 @@
 __version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
diff --git a/spacy/errors.py b/spacy/errors.py
index 2ddaef19bca..adca5880283 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,8 @@
 import warnings
 from typing import Literal
 
+from . import about
+
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
             "table. This may degrade the performance of the model to some "
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed and load the table in your config. The "
-            "languages with lexeme normalization tables are currently: "
-            "{langs}\n\nLoad the table in your config with:\n\n"
+            "If this is surprising, make sure you are loading the table in "
+            "your config. The languages with lexeme normalization tables are "
+            "currently: {langs}\n\nAn example of how to load a table in "
+            "your config :\n\n"
             "[initialize.lookups]\n"
-            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
             "tables = [\"lexeme_norm\"]\n")
     W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
@@ -990,6 +993,18 @@ class Errors(metaclass=ErrorsWithCodes):
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
     E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
+    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
+             "[initialize] or in registered lookups (spacy-lookups-data). An "
+             "example for how to load lemmatizer tables in [initialize]:\n\n"
+             "[initialize.components]\n\n"
+             "[initialize.components.{pipe_name}]\n\n"
+             "[initialize.components.{pipe_name}.lookups]\n"
+             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
+             "tables = {tables}\n"
+             "# or required tables only: tables = {required_tables}\n")
+    E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index 5b2652db53b..72d27c598cc 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -128,13 +128,6 @@ def tokenizer_factory(nlp: "Language") -> Tokenizer:
     return tokenizer_factory
 
 
-@registry.misc("spacy.LookupsDataLoader.v1")
-def load_lookups_data(lang, tables):
-    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
-    lookups = load_lookups(lang=lang, tables=tables)
-    return lookups
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 1a2c44bfa1c..e2e92426f6a 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -2,16 +2,40 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import requests
 import srsly
 from preshed.bloom import BloomFilter
 
 from .errors import Errors
 from .strings import get_string_id
-from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
+from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
 
 UNSET = object()
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+    logger.debug(f"Loading lookups from {url}: {tables}")
+    lookups = Lookups()
+    for table in tables:
+        table_url = url + lang + "_" + table + ".json"
+        r = requests.get(table_url)
+        if r.status_code != 200:
+            raise ValueError(
+                Errors.E4011.format(status_code=r.status_code, url=table_url)
+            )
+        table_data = r.json()
+        lookups.add_table(table, table_data)
+    return lookups
+
+
 def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 09e501595a8..ed9547c745b 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import srsly
 from thinc.api import Model
 
 from .. import util
@@ -155,8 +156,24 @@ def initialize(
         """
         required_tables, optional_tables = self.get_lookups_config(self.mode)
         if lookups is None:
-            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
-            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            logger.debug(
+                "Lemmatizer: no lemmatizer lookups tables provided, "
+                "trying to load tables from registered lookups (usually "
+                "spacy-lookups-data)"
+            )
+            lookups = load_lookups(
+                lang=self.vocab.lang, tables=required_tables, strict=False
+            )
+            missing_tables = set(required_tables) - set(lookups.tables)
+            if len(missing_tables) > 0:
+                raise ValueError(
+                    Errors.E4010.format(
+                        missing_tables=list(missing_tables),
+                        pipe_name=self.name,
+                        required_tables=srsly.json_dumps(required_tables),
+                        tables=srsly.json_dumps(required_tables + optional_tables),
+                    )
+                )
             optional_lookups = load_lookups(
                 lang=self.vocab.lang, tables=optional_tables, strict=False
             )
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index ccc2e0b154a..5385fb5d7dd 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -2,9 +2,11 @@
 
 import pytest
 
+import spacy
 from spacy import registry, util
+from spacy.about import __lookups_url__
 from spacy.lang.en import English
-from spacy.lookups import Lookups
+from spacy.lookups import Lookups, load_lookups_data_from_url
 
 from ..util import make_tempdir
 
@@ -113,3 +115,15 @@ def cope_lookups():
 
     # Make sure that lemmatizer cache can be pickled
     pickle.dumps(lemmatizer2)
+
+
+@pytest.mark.parametrize("lang", ("ca", "en"))
+def test_lemmatizer_load_lookups_from_url(lang):
+    nlp = spacy.blank(lang)
+    lemmatizer = nlp.add_pipe("lemmatizer")
+    req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+    lookups = load_lookups_data_from_url(
+        nlp.lang, req_tables + opt_tables, __lookups_url__
+    )
+    lemmatizer.initialize(lookups=lookups)
+    assert set(lemmatizer.lookups.tables) == set(req_tables + opt_tables)
diff --git a/website/docs/api/lemmatizer.mdx b/website/docs/api/lemmatizer.mdx
index f6657dbf48c..5bd0112e237 100644
--- a/website/docs/api/lemmatizer.mdx
+++ b/website/docs/api/lemmatizer.mdx
@@ -14,7 +14,7 @@ implement their own lemmatizer components via
 [language-specific factories](/usage/processing-pipelines#factories-language).
 The default data used is provided by the
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-extension package.
+repository.
 
 For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
 
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
 >
 > ```python
 > lemmatizer = nlp.add_pipe("lemmatizer")
+> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
 > lemmatizer.initialize(lookups=lookups)
 > ```
 >
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 8555d64ba63..a2d4bbdd387 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -9,6 +9,7 @@ menu:
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
   - ['Callbacks', 'callbacks']
+  - ['Miscellaneous', 'misc']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -1058,6 +1059,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
 | `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
 | **CREATES**                 | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~                                    |
 
+## Miscellaneous {id="misc"}
+
+### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function",version="3"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.lookups]
+> @misc = "spacy.LookupsDataLoader.v1"
+> lang = ${nlp.lang}
+> tables = ["lexeme_prob"]
+> ```
+
+Load the specified tables from the [`lookups` registry](#registry), which are
+provided by a package such as
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
+
+| Name        | Description                                                                                     |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                           |
+| `tables`    | The tables to load. ~~List[str]~~                                                               |
+| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
+
+### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.lemmatizer.lookups]
+> @misc = "spacy.LookupsDataLoaderFromURL.v1"
+> lang = ${nlp.lang}
+> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
+> tables = ["lemma_rules","lemma_exc","lemma_index"]
+> ```
+
+Load the specified tables from the provided URL. The individual tables are
+expected to have filenames in the format `{lang}_{table}.json` under the
+specified URL directory as in the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/spacy_lookups_data/data/)
+repository.
+
+| Name        | Description                                                                                 |
+| ----------- | ------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                       |
+| `url`       | The URL for the directory where the tables can be downloaded. ~~str~~                       |
+| `tables`    | The tables to load. ~~List[str]~~                                                           |
+| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
+
 ## Training data and alignment {id="gold",source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index b8b4917f2b2..6faad1d6a0f 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
-keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
 multiple comma-separated extras). See the `[options.extras_require]` section in
 spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 
 > #### Example
 >
 > ```bash
-> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
+> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
 > ```
 
 | Name             | Description                                                                                                                                                                                                                                                    |
 | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
 | `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
 | `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
 | `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
 To install with extras:
 
 ```bash
-$ pip install --no-build-isolation --editable .[lookups,cuda102]
+$ pip install --no-build-isolation --editable .[ja,cuda102]
 ```
 
 How to install compilers and related build tools:
diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx
index 21cedd1ef2c..26d1ad37962 100644
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@@ -148,11 +148,11 @@ component.
 
 </Infobox>
 
-The data for spaCy's lemmatizers is distributed in the package
+The data for spaCy's lemmatizers is distributed in the repository
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
 provided trained pipelines already include all the required tables, but if you
-are creating new pipelines, you'll probably want to install `spacy-lookups-data`
-to provide the data when the lemmatizer is initialized.
+are creating new pipelines, you can load data from the repository in the
+lemmatizer initialization.
 
 ### Lookup lemmatizer {id="lemmatizer-lookup"}
 
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 43e3a0eeb6c..f4e0a01e8ca 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -50,7 +50,6 @@ const QuickstartInstall = ({ id, title }) => {
     const pipExtras = [
         hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
         train && 'transformers',
-        train && 'lookups',
         apple && 'apple',
         ...modelExtras,
     ]
@@ -214,9 +213,6 @@ const QuickstartInstall = ({ id, title }) => {
             <QS config="train" package="conda" comment prompt={false}>
                 # packages only available via pip
             </QS>
-            <QS config="train" package="conda">
-                pip install spacy-lookups-data
-            </QS>
 
             {languages.map(({ code, models: modelOptions }) => {
                 const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]

From f7021e78758a6e1b0d9e204441323610330c95ff Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 1 Aug 2023 22:24:02 +0900
Subject: [PATCH 082/504] Accept multiple code files in all CLI commands
 (#12101)

* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/cli/_util.py          |   7 ++
 spacy/cli/assemble.py       |   6 +-
 spacy/cli/debug_config.py   |   6 +-
 spacy/cli/debug_data.py     |   6 +-
 spacy/cli/evaluate.py       |   6 +-
 spacy/cli/package.py        |   2 +-
 spacy/cli/pretrain.py       |   6 +-
 spacy/cli/train.py          |   6 +-
 spacy/tests/test_cli_app.py | 206 ++++++++++++++++++++++++++++++++++++
 website/docs/api/cli.mdx    | 108 +++++++++----------
 10 files changed, 286 insertions(+), 73 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index b005accf91f..eed61119070 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -348,6 +348,13 @@ def show_validation_error(
         msg.fail("Config validation error", e, exits=1)
 
 
+def import_code_paths(code_paths: str) -> None:
+    """Helper to import comma-separated list of code paths."""
+    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
+    for code_path in code_paths:
+        import_code(code_path)
+
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index f74bbacb555..7ad0f52fe1d 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -11,7 +11,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -26,7 +26,7 @@ def assemble_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
@@ -46,7 +46,7 @@ def assemble_cli(
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 0e5382cd956..7818b4087e7 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -13,7 +13,7 @@
     Arg,
     Opt,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -27,7 +27,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on
@@ -44,7 +44,7 @@ def debug_config_cli(
     DOCS: https://spacy.io/api/cli#debug-config
     """
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
     )
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 4c44a8c0e2b..714969be145 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -40,7 +40,7 @@
     _format_number,
     app,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -72,7 +72,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@@ -92,7 +92,7 @@ def debug_data_cli(
             "--help for an overview of the other available debugging commands."
         )
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_data(
         config_path,
         config_overrides=overrides,
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 2276ca6b0d4..c3527028e9d 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -10,7 +10,7 @@
 from ..scorer import Scorer
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
 
 
 @benchmark_cli.command(
@@ -22,7 +22,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -43,7 +43,7 @@ def evaluate_cli(
 
     DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
-    import_code(code_path)
+    import_code_paths(code_path)
     evaluate(
         model,
         data_path,
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 9421199f111..06b503271af 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -23,7 +23,7 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
+    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 446c40510df..73337a7ca98 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -11,7 +11,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -27,7 +27,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -56,7 +56,7 @@ def pretrain_cli(
     DOCS: https://spacy.io/api/cli#pretrain
     """
     config_overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
     setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c72e13b2681..40934f546e2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -13,7 +13,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -28,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
@@ -50,7 +50,7 @@ def train_cli(
     if verbose:
         util.logger.setLevel(logging.DEBUG)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
 
 
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 32ca639b37d..f9c1a9d6579 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,4 +1,5 @@
 import os
+import subprocess
 import sys
 from pathlib import Path
 
@@ -6,6 +7,7 @@
 import srsly
 from typer.testing import CliRunner
 
+import spacy
 from spacy.cli._util import app, get_git_version
 from spacy.tokens import Doc, DocBin
 
@@ -47,6 +49,210 @@ def test_convert_auto_conflict():
         assert len(out_files) == 0
 
 
+NOOP_CONFIG = """
+[paths]
+train = null
+dev = null
+vectors = null
+init_tok2vec = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "mul"
+pipeline = ["noop", "noop2"]
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+[components]
+
+[components.noop]
+factory = "noop"
+
+[components.noop2]
+factory = "noop2"
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[training]
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 100
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+dev_corpus = "corpora.dev"
+
+train_corpus = "corpora.train"
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.fixture
+def data_paths():
+    nlp = spacy.blank("mul")
+    doc = nlp("ok")
+    with make_tempdir() as tdir:
+        db = DocBin()
+        # debug data will *fail* if there aren't enough docs
+        for ii in range(100):
+            db.add(doc)
+        fpath = tdir / "data.spacy"
+        db.to_disk(fpath)
+
+        args = [
+            "--paths.train",
+            str(fpath),
+            "--paths.dev",
+            str(fpath),
+        ]
+        yield args
+
+
+@pytest.fixture
+def code_paths():
+    noop_base = """
+from spacy.language import Language
+
+@Language.component("{}")
+def noop(doc):
+    return doc
+"""
+
+    with make_tempdir() as temp_d:
+        # write code files to load
+        paths = []
+        for ff in ["noop", "noop2"]:
+            pyfile = temp_d / f"{ff}.py"
+            pyfile.write_text(noop_base.format(ff))
+            paths.append(pyfile)
+
+        args = ["--code", ",".join([str(pp) for pp in paths])]
+        yield args
+
+
+@pytest.fixture
+def noop_config():
+    with make_tempdir() as temp_d:
+        cfg = temp_d / "config.cfg"
+        cfg.write_text(NOOP_CONFIG)
+
+        yield cfg
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "cmd",
+    ["debug config", "debug data", "train", "assemble"],
+)
+def test_multi_code(cmd, code_paths, data_paths, noop_config):
+    # check that it fails without the code arg
+    cmd = cmd.split()
+    output = ["."] if cmd[0] == "assemble" else []
+    cmd = [sys.executable, "-m", "spacy"] + cmd
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+
+@pytest.mark.slow
+def test_multi_code_evaluate(code_paths, data_paths, noop_config):
+    # Evaluation requires a model, not a config, so this works differently from
+    # the other commands.
+
+    # Train a model to evaluate
+    cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split()
+    result = subprocess.run([*cmd, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+    # now do the evaluation
+
+    eval_data = data_paths[-1]
+    cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split()
+
+    # check that it fails without the code arg
+    result = subprocess.run(cmd)
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, *code_paths])
+    assert result.returncode == 0
+
+
 def test_benchmark_accuracy_alias():
     # Verify that the `evaluate` alias works correctly.
     result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 3f91e1ff71e..765bcb8c675 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -176,15 +176,15 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
-| Name                   | Description                                                                                                                                                                          |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                            |
-| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                   |
-| `--code`, `-c`         | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                    |
-| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                        |
-| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                   |
+| Name                   | Description                                                                                                                                                                                            |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                                              |
+| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                                     |
+| `--code`, `-c`         | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                      |
+| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                                          |
+| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                                     |
 
 ### init fill-curated-transformer {id="init-fill-curated-transformer",version="3.7",tag="command"}
 
@@ -266,7 +266,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                                               |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~                                                                                                                                                |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -491,7 +491,7 @@ File       /path/to/thinc/thinc/schedules.py (line 91)
 | Name                     | Description                                                                                                                                                                                                                    |
 | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~             |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                         |
 | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
 | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
@@ -676,7 +676,7 @@ will not be available.
 | Name                       | Description                                                                                                                                                                                                        |
 | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`              | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `--code`, `-c`             | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`             | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~                                                                                                                                                       |
 | `--verbose`, `-V`          | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                     |
 | `--no-format`, `-NF`       | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~                                                                                                                           |
@@ -1136,7 +1136,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -1206,6 +1206,7 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
+| `--code`, `-c`                                     | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
 | `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
@@ -1243,20 +1244,19 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component] [--spans-key]
 ```
 
-| Name                                                 | Description                                                                                                                                                                          |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
-| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
-| `--code`, `-c` <Tag variant="new">3</Tag>            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
-| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
-| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                           |
-| `--spans-key`, `-sk` <Tag variant="new">3.6.2</Tag>  | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~                                                                                                     |
-| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                            |
+| Name                                                 | Description                                                                                                                                                                                            |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                                              |
+| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                                    |
+| `--code`, `-c` <Tag variant="new">3</Tag>            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                                             |
+| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~                              |
+| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                                             |
+| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                                              |
 
 ### speed {id="benchmark-speed", version="3.5", tag="command"}
 
@@ -1302,19 +1302,19 @@ If you want to evaluate the pipeline on raw text only, make sure that the .spacy
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
 
-| Name                      | Description                                                                                                                                                                          |
-| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
-| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
-| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
-| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
-| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
-| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
-| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
-| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
+| Name                      | Description                                                                                                                                                                                            |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                    |
+| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                                   |
+| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                                             |
+| `--code`, `-c`            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                                              |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                                      |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                                    |
+| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                           |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                                               |
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
@@ -1341,19 +1341,19 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                    | Description                                                                                                                                                                          |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| Name                     | Description                                                                                                                                                                                            |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                                  |
+| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                                           |
+| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                                          |
+| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                                              |
+| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                                     |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                                       |
+| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
 
 ## assemble {#assemble tag="command"}
 
@@ -1377,7 +1377,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `config_path`     | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`      | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                   |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~                                                |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~        |
 | `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~                                                                                                                                                |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                    |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~                            |

From c25f02f8c661ca40b6710700b37058e947b11dc7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 9 Aug 2023 10:55:52 +0200
Subject: [PATCH 083/504] Switch zh tokenizer default pkuseg_model to
 spacy_ontonotes (#12896)

So that users can use `copy_from_base_model` for other segmenters
without having to override an irrelevant `pkuseg_model` setting, switch
the default `pkuseg_model` to `spacy_ontonotes`.
---
 spacy/lang/zh/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index f7bb092771c..6b980b52b61 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -31,7 +31,7 @@
 [initialize]
 
 [initialize.tokenizer]
-pkuseg_model = null
+pkuseg_model = "spacy_ontonotes"
 pkuseg_user_dict = "default"
 """
 

From e3eae4f4a5ce01b777c6b45b23c15f3ed9bf88e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 14:38:05 +0100
Subject: [PATCH 084/504] Revert "Reimplement distillation with oracle cut size
 (#12214)"

This reverts commit e27c60a70263f7ab17968964de37e938653e37a2.
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 105 ++++++---------------------
 spacy/tests/parser/test_model.py     |  61 ----------------
 spacy/tests/parser/test_ner.py       |   5 +-
 spacy/tests/parser/test_parse.py     |   5 +-
 5 files changed, 24 insertions(+), 156 deletions(-)
 delete mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 6c5c29d8549..e497643f0cd 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -267,11 +267,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1 and (actions is None or len(actions) > 0):
+    while sizes.states >= 1:
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
-        assert step_actions is None or step_actions.size == sizes.states, \
-            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9fa0d4987b8..99970b3fe93 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -49,10 +49,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 
 
 
-# TODO: Remove when we switch to Cython 3.
-cdef extern from "<algorithm>" namespace "std" nogil:
-    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
-
 NUMPY_OPS = NumpyOps()
 
 
@@ -271,8 +267,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -283,14 +279,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs,
-                                               states=[state.copy() for state in states],
-                                               moves=self.moves,
-                                               max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_diff_to_actions(states, student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-                                               states=states, moves=teacher_pipe.moves, actions=actions)
+            moves=self.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -538,7 +532,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_to_actions(student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -658,7 +652,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
+    def _init_batch(self, teacher_step_model, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -667,12 +661,10 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
+            StateClass start_state
             StateClass state
-            TransitionSystem moves = teacher_pipe.moves
-
-        # Start with the same heuristic as in supervised training: exclude
-        # docs that are within the maximum length.
-        all_states = moves.init_batch(docs)
+            Transition action
+        all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -681,30 +673,19 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
-
-        if not to_cut:
-            return states
-
-        # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs,
-                                               moves=moves,
-                                               states=[state.copy() for state in to_cut])
-        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
-
-        # Step through the teacher's actions and store every state after
-        # each multiple of max_length.
-        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            for step_actions in teacher_actions[:max_length]:
-                to_cut = moves.apply_actions(to_cut, step_actions)
-            teacher_actions = teacher_actions[max_length:]
-
-            if len(teacher_actions) < max_length:
-                break
-
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
         return states
 
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long doc will get multiple states. Let's say we
@@ -765,7 +746,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -786,47 +767,3 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
-
-
-def _states_diff_to_actions(
-    before_states: List[StateClass],
-    after_states: List[StateClass]
-) -> List[Ints1d]:
-    """
-    Return for two sets of states the actions to go from the first set of
-    states to the second set of states. The histories of the first set of
-    states must be a prefix of the second set of states.
-    """
-    cdef StateClass before_state, after_state
-    cdef StateC* c_state_before
-    cdef StateC* c_state_after
-
-    assert len(before_states) == len(after_states)
-
-    # Check invariant: before states histories must be prefixes of after states.
-    for before_state, after_state in zip(before_states, after_states):
-        c_state_before = before_state.c
-        c_state_after = after_state.c
-
-        assert equal(c_state_before.history.begin(),
-                     c_state_before.history.end(),
-                     c_state_after.history.begin())
-
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for before_state, after_state in zip(before_states, after_states):
-            c_state_before = before_state.c
-            c_state_after = after_state.c
-            if step < c_state_after.history.size() - c_state_before.history.size():
-                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
deleted file mode 100644
index 8c1cf7a9346..00000000000
--- a/spacy/tests/parser/test_model.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import pytest
-
-from spacy.lang.en import English
-from spacy.ml.tb_framework import TransitionModelInputs
-from spacy.training import Example
-
-TRAIN_DATA = [
-    (
-        "They trade mortgage-backed securities.",
-        {
-            "heads": [1, 1, 4, 4, 5, 1, 1],
-            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
-        },
-    ),
-    (
-        "I like London and Berlin.",
-        {
-            "heads": [1, 1, 1, 2, 2, 1],
-            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
-        },
-    ),
-]
-
-
-@pytest.fixture
-def nlp_parser():
-    nlp = English()
-    parser = nlp.add_pipe("parser")
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for dep in annotations["deps"]:
-            parser.add_label(dep)
-    nlp.initialize()
-
-    return nlp, parser
-
-
-def test_incorrect_number_of_actions(nlp_parser):
-    nlp, parser = nlp_parser
-    doc = nlp.make_doc("test")
-
-    # Too many actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
-            )
-        )
-
-    # Too few actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc, doc],
-                moves=parser.moves,
-                actions=[numpy.array([0], dtype="i")],
-            )
-        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index f0efc3a63fd..bb9b7653ce3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -623,9 +623,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,7 +641,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 636bb887789..d25eb165acb 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -462,9 +462,7 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -482,7 +480,6 @@ def test_distill(max_moves):
 
     student = English()
     student_parser = student.add_pipe("parser")
-    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 905b2b901162801fb4b40537f672b92ec9a02b67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:23:08 +0100
Subject: [PATCH 085/504] Revert "Merge the parser refactor into `v4` (#10940)"

This reverts commit a183db3cefe0713e828868b43daf61c464cae05c.
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/compat.py                               |   5 +
 spacy/errors.py                               |   7 +-
 spacy/ml/_precomputable_affine.py             | 164 +++++
 spacy/ml/models/parser.py                     | 177 +++--
 spacy/ml/parser_model.pxd                     |  49 ++
 spacy/ml/parser_model.pyx                     | 500 ++++++++++++++
 spacy/ml/tb_framework.pxd                     |  28 -
 spacy/ml/tb_framework.py                      |  50 ++
 spacy/ml/tb_framework.pyx                     | 639 ------------------
 .../_parser_internals/_beam_utils.pyx         |   4 +-
 .../_parser_internals/_parser_utils.pxd       |   2 -
 .../_parser_internals/_parser_utils.pyx       |  22 -
 spacy/pipeline/_parser_internals/_state.pxd   |  60 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 -
 spacy/pipeline/_parser_internals/batch.pxd    |   2 -
 spacy/pipeline/_parser_internals/batch.pyx    |  52 --
 spacy/pipeline/_parser_internals/ner.pyx      |   6 +-
 .../pipeline/_parser_internals/stateclass.pyx |   7 -
 .../_parser_internals/transition_system.pxd   |   7 -
 .../_parser_internals/transition_system.pyx   |  73 --
 .../{dep_parser.py => dep_parser.pyx}         |  20 +-
 spacy/pipeline/{ner.py => ner.pyx}            |  35 +-
 spacy/pipeline/transition_parser.pxd          |  21 +
 spacy/pipeline/transition_parser.pyx          | 504 ++++++++------
 spacy/tests/parser/test_ner.py                |  10 +-
 spacy/tests/parser/test_parse.py              |  74 +-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  55 +-
 spacy/training/example.pyx                    |   2 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 36 files changed, 1384 insertions(+), 1312 deletions(-)
 create mode 100644 spacy/ml/_precomputable_affine.py
 create mode 100644 spacy/ml/parser_model.pxd
 create mode 100644 spacy/ml/parser_model.pyx
 delete mode 100644 spacy/ml/tb_framework.pxd
 create mode 100644 spacy/ml/tb_framework.py
 delete mode 100644 spacy/ml/tb_framework.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.py => dep_parser.pyx} (96%)
 rename spacy/pipeline/{ner.py => ner.pyx} (93%)
 create mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index a4a87d68aec..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,12 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.tb_framework",
+    "spacy.ml.parser_model",
     "spacy.morphology",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -43,7 +45,6 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
-    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -51,7 +52,6 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
-    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 36325711d4d..2817147f3e9 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,11 +90,12 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -110,11 +111,12 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -385,11 +387,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -402,11 +405,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/compat.py b/spacy/compat.py
index 1e63807a0e8..30459e2e495 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,6 +23,11 @@
 except ImportError:
     cupy = None
 
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index adca5880283..a5d0b3d11a9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,12 +217,6 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
-    # v4 warning strings
-    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
-    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
-            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
-            "to return `True` in `.supports_prior_probs`.")
-
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -1007,6 +1001,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
+
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
new file mode 100644
index 00000000000..1c20c622b2c
--- /dev/null
+++ b/spacy/ml/_precomputable_affine.py
@@ -0,0 +1,164 @@
+from thinc.api import Model, normal_init
+
+from ..util import registry
+
+
+@registry.layers("spacy.PrecomputableAffine.v1")
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
+    model = Model(
+        "precomputable_affine",
+        forward,
+        init=init,
+        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+        params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout},
+    )
+    return model
+
+
+def forward(model, X, is_train):
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.get_param("W")
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
+    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
+
+    def backward(dY_ids):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nO, nP), and get back:
+        # (nB, nO, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nO, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        assert dY.ndim == 3
+        assert dY.shape[1] == nO, dY.shape
+        assert dY.shape[2] == nP, dY.shape
+        # nB = dY.shape[0]
+        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+        Xf = X[ids]
+        Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+        model.inc_grad("b", dY.sum(axis=0))
+        dY = dY.reshape((dY.shape[0], nO * nP))
+
+        Wopfi = W.transpose((1, 2, 0, 3))
+        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
+        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+        # (o, p, f, i) --> (f, o, p, i)
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
+        model.inc_grad("W", dWopfi)
+        return dXf.reshape((dXf.shape[0], nF, nI))
+
+    return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+    nB = dY.shape[0]
+    nF = model.get_dim("nF")
+    nP = model.get_dim("nP")
+    nO = model.get_dim("nO")
+    # Backprop the "padding", used as a filler for missing values.
+    # Values that are missing are set to -1, and each state vector could
+    # have multiple missing values. The padding has different values for
+    # different missing features. The gradient of the padding vector is:
+    #
+    # for b in range(nB):
+    #     for f in range(nF):
+    #         if ids[b, f] < 0:
+    #             d_pad[f] += dY[b]
+    #
+    # Which can be rewritten as:
+    #
+    # (ids < 0).T @ dY
+    mask = model.ops.asarray(ids < 0, dtype="f")
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
+    return d_pad.reshape((1, nF, nO, nP))
+
+
+def init(model, X=None, Y=None):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    if model.has_param("W") and model.get_param("W").any():
+        return
+
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nO, nP, nI)
+    b = model.ops.alloc2f(nO, nP)
+    pad = model.ops.alloc4f(1, nF, nO, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
+        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors = vectors.reshape((vectors.shape[0], nO, nP))
+        vectors += b
+        vectors = model.ops.asarray(vectors)
+        if nP >= 2:
+            return model.ops.maxout(vectors)[0]
+        else:
+            return vectors * (vectors >= 0)
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = model.get_param("W").copy()
+    b = model.get_param("b").copy()
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("b", b)
+        else:
+            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 422abf4e260..a70d84dea8f 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,66 +1,23 @@
-import warnings
-from typing import Any, List, Literal, Optional, Tuple
-
-from thinc.api import Model
+from typing import Optional, List, cast
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
-from ...errors import Errors, Warnings
-from ...tokens.doc import Doc
+from ...errors import Errors
+from ...compat import Literal
 from ...util import registry
+from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-
-TransitionSystem = Any  # TODO
-State = Any  # TODO
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v2")
-def transition_parser_v2(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    use_upper: bool,
-    nO: Optional[int] = None,
-) -> Model:
-    if not use_upper:
-        warnings.warn(Warnings.W400)
-
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v3")
-def transition_parser_v3(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-) -> Model:
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
+from ...tokens import Doc
 
 
+@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
+    use_upper: bool,
     nO: Optional[int] = None,
 ) -> Model:
     """
@@ -94,7 +51,14 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3.
+        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
+        is replaced with a ReLu non-linearity if use_upper=True, and no
+        non-linearity if use_upper=False.
+    use_upper (bool): Whether to use an additional hidden layer after the state
+        vector in order to predict the action scores. It is recommended to set
+        this to False for large pretrained models such as transformers, and True
+        for smaller networks. The upper layer is computed on CPU, which becomes
+        a bottleneck on larger GPU-based models, where it's also less necessary.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -105,11 +69,106 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    return TransitionModel(
-        tok2vec=tok2vec,
-        state_tokens=nr_feature_tokens,
-        hidden_width=hidden_width,
-        maxout_pieces=maxout_pieces,
-        nO=nO,
-        unseen_classes=set(),
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec = chain(
+        tok2vec,
+        list2array(),
+        Linear(hidden_width, t2v_width),
     )
+    tok2vec.set_dim("nO", hidden_width)
+    lower = _define_lower(
+        nO=hidden_width if use_upper else nO,
+        nF=nr_feature_tokens,
+        nI=tok2vec.get_dim("nO"),
+        nP=maxout_pieces,
+    )
+    upper = None
+    if use_upper:
+        with use_ops("cpu"):
+            # Initialize weights at zero, as it's a classification layer.
+            upper = _define_upper(nO=nO, nI=None)
+    return TransitionModel(tok2vec, lower, upper, resize_output)
+
+
+def _define_upper(nO, nI):
+    return Linear(nO=nO, nI=nI, init_W=zero_init)
+
+
+def _define_lower(nO, nF, nI, nP):
+    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
+
+
+def resize_output(model, new_nO):
+    if model.attrs["has_upper"]:
+        return _resize_upper(model, new_nO)
+    return _resize_lower(model, new_nO)
+
+
+def _resize_upper(model, new_nO):
+    upper = model.get_ref("upper")
+    if upper.has_dim("nO") is None:
+        upper.set_dim("nO", new_nO)
+        return model
+    elif new_nO == upper.get_dim("nO"):
+        return model
+
+    smaller = upper
+    nI = smaller.maybe_get_dim("nI")
+    with use_ops("cpu"):
+        larger = _define_upper(nO=new_nO, nI=nI)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc2f(new_nO, nI)
+        larger_b = larger.ops.alloc1f(new_nO)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        # Weights are stored in (nr_out, nr_in) format, so we're basically
+        # just adding rows here.
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:old_nO] = smaller_W
+            larger_b[:old_nO] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+    model._layers[-1] = larger
+    model.set_ref("upper", larger)
+    return model
+
+
+def _resize_lower(model, new_nO):
+    lower = model.get_ref("lower")
+    if lower.has_dim("nO") is None:
+        lower.set_dim("nO", new_nO)
+        return model
+
+    smaller = lower
+    nI = smaller.maybe_get_dim("nI")
+    nF = smaller.maybe_get_dim("nF")
+    nP = smaller.maybe_get_dim("nP")
+    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
+        larger_b = larger.ops.alloc2f(new_nO, nP)
+        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        smaller_pad = smaller.get_param("pad")
+        # Copy the old weights and padding into the new layer
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:, 0:old_nO, :, :] = smaller_W
+            larger_pad[:, :, 0:old_nO, :] = smaller_pad
+            larger_b[0:old_nO, :] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+        larger.set_param("pad", larger_pad)
+    model._layers[1] = larger
+    model.set_ref("lower", larger)
+    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
new file mode 100644
index 00000000000..8def6cea53f
--- /dev/null
+++ b/spacy/ml/parser_model.pxd
@@ -0,0 +1,49 @@
+from libc.string cimport memset, memcpy
+from thinc.backends.cblas cimport CBlas
+from ..typedefs cimport weight_t, hash_t
+from ..pipeline._parser_internals._state cimport StateC
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const float* seen_classes
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* scores
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
+
+
+cdef WeightsC get_c_weights(model) except *
+
+cdef SizesC get_c_sizes(model, int batch_size) except *
+
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil
+ 
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores, int O) nogil
+ 
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
new file mode 100644
index 00000000000..91558683b60
--- /dev/null
+++ b/spacy/ml/parser_model.pyx
@@ -0,0 +1,500 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+cimport numpy as np
+from libc.math cimport exp
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from thinc.backends.cblas cimport saxpy, sgemm
+
+import numpy
+import numpy.random
+from thinc.api import Model, CupyOps, NumpyOps, get_ops
+
+from .. import util
+from ..errors import Errors
+from ..typedefs cimport weight_t, class_t, hash_t
+from ..pipeline._parser_internals.stateclass cimport StateClass
+
+
+cdef WeightsC get_c_weights(model) except *:
+    cdef WeightsC output
+    cdef precompute_hiddens state2vec = model.state2vec
+    output.feat_weights = state2vec.get_feat_weights()
+    output.feat_bias = <const float*>state2vec.bias.data
+    cdef np.ndarray vec2scores_W
+    cdef np.ndarray vec2scores_b
+    if model.vec2scores is None:
+        output.hidden_weights = NULL
+        output.hidden_bias = NULL
+    else:
+        vec2scores_W = model.vec2scores.get_param("W")
+        vec2scores_b = model.vec2scores.get_param("b")
+        output.hidden_weights = <const float*>vec2scores_W.data
+        output.hidden_bias = <const float*>vec2scores_b.data
+    cdef np.ndarray class_mask = model._class_mask
+    output.seen_classes = <const float*>class_mask.data
+    return output
+
+
+cdef SizesC get_c_sizes(model, int batch_size) except *:
+    cdef SizesC output
+    output.states = batch_size
+    if model.vec2scores is None:
+        output.classes = model.state2vec.get_dim("nO")
+    else:
+        output.classes = model.vec2scores.get_dim("nO")
+    output.hiddens = model.state2vec.get_dim("nO")
+    output.pieces = model.state2vec.get_dim("nP")
+    output.feats = model.state2vec.get_dim("nF")
+    output.embed_width = model.tokvecs.shape[1]
+    return output
+
+
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.scores = <float*>realloc(A.scores,
+            n.states * n.classes * sizeof(A.scores[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil:
+    cdef double one = 1.0
+    resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
+    sum_state_features(cblas, A.unmaxed,
+        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = _arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    memset(A.scores, 0, n.states * n.classes * sizeof(float))
+    if W.hidden_weights == NULL:
+        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+            1.0, <const float *>A.hiddens, n.hiddens,
+            <const float *>W.hidden_weights, n.hiddens,
+            0.0, A.scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = A.scores[0]
+    for i in range(1, n.states * n.classes):
+        if A.scores[i] < min_:
+            min_ = A.scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if not W.seen_classes[j]:
+                A.scores[i*n.classes+j] = min_
+
+
+cdef void sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    padding = cached
+    cached += F * O
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores,
+        int O) nogil:
+    """Do multi-label log loss"""
+    cdef double max_, gmax, Z, gZ
+    best = arg_max_if_gold(scores, costs, is_valid, O)
+    guess = _arg_max(scores, O)
+
+    if best == -1 or guess == -1:
+        # These shouldn't happen, but if they do, we want to make sure we don't
+        # cause an OOB access.
+        return
+    Z = 1e-10
+    gZ = 1e-10
+    max_ = scores[guess]
+    gmax = scores[best]
+    for i in range(O):
+        Z += exp(scores[i] - max_)
+        if costs[i] <= costs[best]:
+            gZ += exp(scores[i] - gmax)
+    for i in range(O):
+        if costs[i] <= costs[best]:
+            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
+        else:
+            d_scores[i] = exp(scores[i]-max_) / Z
+
+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
+        const int* is_valid, int n) nogil:
+    # Find minimum cost
+    cdef float cost = 1
+    for i in range(n):
+        if is_valid[i] and costs[i] < cost:
+            cost = costs[i]
+    # Now find best-scoring with that cost
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= cost and is_valid[i]:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+
+class ParserStepModel(Model):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+            dropout=0.1):
+        Model.__init__(self, name="parser_step_model", forward=step_forward)
+        self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
+        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
+        if layers[1].get_dim("nP") >= 2:
+            activation = "maxout"
+        elif has_upper:
+            activation = None
+        else:
+            activation = "relu"
+        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+                                            activation=activation, train=train)
+        if has_upper:
+            self.vec2scores = layers[-1]
+        else:
+            self.vec2scores = None
+        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
+        self.backprops = []
+        self._class_mask = numpy.zeros((self.nO,), dtype='f')
+        self._class_mask.fill(1)
+        if unseen_classes is not None:
+            for class_ in unseen_classes:
+                self._class_mask[class_] = 0.
+
+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
+    @property
+    def nO(self):
+        if self.attrs["has_upper"]:
+            return self.vec2scores.get_dim("nO")
+        else:
+            return self.state2vec.get_dim("nO")
+
+    def class_is_unseen(self, class_):
+        return self._class_mask[class_]
+
+    def mark_class_unseen(self, class_):
+        self._class_mask[class_] = 0
+
+    def mark_class_seen(self, class_):
+        self._class_mask[class_] = 1
+
+    def get_token_ids(self, states):
+        cdef StateClass state
+        states = [state for state in states if not state.is_final()]
+        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
+                                          dtype='i', order='C')
+        ids.fill(-1)
+        c_ids = <int*>ids.data
+        for state in states:
+            state.c.set_context_tokens(c_ids, ids.shape[1])
+            c_ids += ids.shape[1]
+        return ids
+
+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
+    def finish_steps(self, golds):
+        # Add a padding vector to the d_tokvecs gradient, so that missing
+        # values don't affect the real gradient.
+        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
+        for ids, d_vector, bp_vector in self.backprops:
+            d_state_features = bp_vector((d_vector, ids))
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.ops.scatter_add(d_tokvecs, ids,
+                d_state_features)
+        # Padded -- see update()
+        self.bp_tokvecs(d_tokvecs[:-1])
+        return d_tokvecs
+
+NUMPY_OPS = NumpyOps()
+
+def step_forward(model: ParserStepModel, states, is_train):
+    token_ids = model.get_token_ids(states)
+    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
+    if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            vector *= mask
+        scores, get_d_vector = model.vec2scores(vector, is_train)
+    else:
+        scores = NumpyOps().asarray(vector)
+        get_d_vector = lambda d_scores: d_scores
+    # If the class is unseen, make sure its score is minimum
+    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
+
+    def backprop_parser_step(d_scores):
+        # Zero vectors for unseen classes
+        d_scores *= model._class_mask
+        d_vector = get_d_vector(d_scores)
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
+        return None
+    return scores, backprop_parser_step
+
+
+cdef class precompute_hiddens:
+    """Allow a model to be "primed" by pre-computing input features in bulk.
+
+    This is used for the parser, where we want to take a batch of documents,
+    and compute vectors for each (token, position) pair. These vectors can then
+    be reused, especially for beam-search.
+
+    Let's say we're using 12 features for each state, e.g. word at start of
+    buffer, three words on stack, their children, etc. In the normal arc-eager
+    system, a document of length N is processed in 2*N states. This means we'll
+    create 2*N*12 feature vectors --- but if we pre-compute, we only need
+    N*12 vector computations. The saving for beam-search is much better:
+    if we have a beam of k, we'll normally make 2*N*12*K computations --
+    so we can save the factor k. This also gives a nice CPU/GPU division:
+    we can do all our hard maths up front, packed into large multiplications,
+    and do the hard-to-program parsing on the CPU.
+    """
+    cdef readonly int nF, nO, nP
+    cdef bint _is_synchronized
+    cdef public object ops
+    cdef public object numpy_ops
+    cdef public object _cpu_ops
+    cdef np.ndarray _features
+    cdef np.ndarray _cached
+    cdef np.ndarray bias
+    cdef object _cuda_stream
+    cdef object _bp_hiddens
+    cdef object activation
+
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 activation="maxout", train=False):
+        gpu_cached, bp_features = lower_model(tokvecs, train)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
+            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
+        else:
+            self.bias = lower_model.get_param("b")
+        self.nF = cached.shape[1]
+        if lower_model.has_dim("nP"):
+            self.nP = lower_model.get_dim("nP")
+        else:
+            self.nP = 1
+        self.nO = cached.shape[2]
+        self.ops = lower_model.ops
+        self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
+        assert activation in (None, "relu", "maxout")
+        self.activation = activation
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    cdef const float* get_feat_weights(self) except NULL:
+        if not self._is_synchronized and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        return <float*>self._cached.data
+
+    def has_dim(self, name):
+        if name == "nF":
+            return self.nF if self.nF is not None else True
+        elif name == "nP":
+            return self.nP if self.nP is not None else True
+        elif name == "nO":
+            return self.nO if self.nO is not None else True
+        else:
+            return False
+
+    def get_dim(self, name):
+        if name == "nF":
+            return self.nF
+        elif name == "nP":
+            return self.nP
+        elif name == "nO":
+            return self.nO
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def set_dim(self, name, value):
+        if name == "nF":
+            self.nF = value
+        elif name == "nP":
+            self.nP = value
+        elif name == "nO":
+            self.nO = value
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def __call__(self, X, bint is_train):
+        if is_train:
+            return self.begin_update(X)
+        else:
+            return self.predict(X), lambda X: X
+
+    def predict(self, X):
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids):
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        # This is tricky, but (assuming GPU available);
+        # - Input to forward on CPU
+        # - Output from forward on CPU
+        # - Input to backward on GPU!
+        # - Output from backward on GPU
+        bp_hiddens = self._bp_hiddens
+
+        cdef CBlas cblas = self._cpu_ops.cblas()
+
+        feat_weights = self.get_feat_weights()
+        cdef int[:, ::1] ids = token_ids
+        sum_state_features(cblas, <float*>state_vector.data,
+            feat_weights, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
+
+        def backward(d_state_vector_ids):
+            d_state_vector, token_ids = d_state_vector_ids
+            d_state_vector = bp_nonlinearity(d_state_vector)
+            d_tokens = bp_hiddens((d_state_vector, token_ids))
+            return d_tokens
+        return state_vector, backward
+
+    def _nonlinearity(self, state_vector):
+        if self.activation == "maxout":
+            return self._maxout_nonlinearity(state_vector)
+        else:
+            return self._relu_nonlinearity(state_vector)
+
+    def _maxout_nonlinearity(self, state_vector):
+        state_vector, mask = self.numpy_ops.maxout(state_vector)
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_maxout(d_best):
+            return self.ops.backprop_maxout(d_best, mask, self.nP)
+        
+        return state_vector, backprop_maxout
+
+    def _relu_nonlinearity(self, state_vector):
+        state_vector = state_vector.reshape((state_vector.shape[0], -1))
+        mask = state_vector >= 0.
+        state_vector *= mask
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_relu(d_best):
+            d_best *= mask
+            return d_best.reshape((d_best.shape + (1,)))
+ 
+        return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
deleted file mode 100644
index 965508519e8..00000000000
--- a/spacy/ml/tb_framework.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from libc.stdint cimport int8_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-    int tokens
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const int8_t* seen_mask
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 00000000000..ab4a969e24e
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,50 @@
+from thinc.api import Model, noop
+from .parser_model import ParserStepModel
+from ..util import registry
+
+
+@registry.layers("spacy.TransitionModel.v1")
+def TransitionModel(
+    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
+):
+    """Set up a stepwise transition-based model"""
+    if upper is None:
+        has_upper = False
+        upper = noop()
+    else:
+        has_upper = True
+    # don't define nO for this object, because we can't dynamically change it
+    return Model(
+        name="parser_model",
+        forward=forward,
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
+        layers=[tok2vec, lower, upper],
+        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+        init=init,
+        attrs={
+            "has_upper": has_upper,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def forward(model, X, is_train):
+    step_model = ParserStepModel(
+        X,
+        model.layers,
+        unseen_classes=model.attrs["unseen_classes"],
+        train=is_train,
+        has_upper=model.attrs["has_upper"],
+    )
+
+    return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+    model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower")
+    lower.initialize()
+    if model.attrs["has_upper"]:
+        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
deleted file mode 100644
index e497643f0cd..00000000000
--- a/spacy/ml/tb_framework.pyx
+++ /dev/null
@@ -1,639 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, cast
-
-from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
-import numpy
-
-cimport numpy as np
-
-from thinc.api import (
-    Linear,
-    Model,
-    NumpyOps,
-    chain,
-    glorot_uniform_init,
-    list2array,
-    normal_init,
-    uniform_init,
-    zero_init,
-)
-
-from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-
-from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
-
-from ..errors import Errors
-from ..pipeline._parser_internals import _beam_utils
-from ..pipeline._parser_internals.batch import GreedyBatch
-
-from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
-from ..pipeline._parser_internals.transition_system cimport (
-    TransitionSystem,
-    c_apply_actions,
-    c_transition_batch,
-)
-
-from ..tokens.doc import Doc
-from ..util import registry
-
-State = Any  # TODO
-
-
-@registry.layers("spacy.TransitionModel.v2")
-def TransitionModel(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    beam_width: int = 1,
-    beam_density: float = 0.0,
-    state_tokens: int,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-    unseen_classes=set(),
-) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
-    """Set up a transition-based parsing model, using a maxout hidden
-    layer and a linear output layer.
-    """
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
-    tok2vec_projected.set_dim("nO", hidden_width)
-
-    # FIXME: we use `output` as a container for the output layer's
-    # weights and biases. Thinc optimizers cannot handle resizing
-    # of parameters. So, when the parser model is resized, we
-    # construct a new `output` layer, which has a different key in
-    # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `output` layer by `output_W` and `output_b`
-    # parameters in this model.
-    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
-
-    return Model(
-        name="parser_model",
-        forward=forward,
-        init=init,
-        layers=[tok2vec_projected, output],
-        refs={
-            "tok2vec": tok2vec_projected,
-            "output": output,
-        },
-        params={
-            "hidden_W": None,  # Floats2d W for the hidden layer
-            "hidden_b": None,  # Floats1d bias for the hidden layer
-            "hidden_pad": None,  # Floats1d padding for the hidden layer
-        },
-        dims={
-            "nO": None,  # Output size
-            "nP": maxout_pieces,
-            "nH": hidden_width,
-            "nI": tok2vec_projected.maybe_get_dim("nO"),
-            "nF": state_tokens,
-        },
-        attrs={
-            "beam_width": beam_width,
-            "beam_density": beam_density,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def resize_output(model: Model, new_nO: int) -> Model:
-    old_nO = model.maybe_get_dim("nO")
-    output = model.get_ref("output")
-    if old_nO is None:
-        model.set_dim("nO", new_nO)
-        output.set_dim("nO", new_nO)
-        output.initialize()
-        return model
-    elif new_nO <= old_nO:
-        return model
-    elif output.has_param("W"):
-        nH = model.get_dim("nH")
-        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_output.initialize()
-        new_W = new_output.get_param("W")
-        new_b = new_output.get_param("b")
-        old_W = output.get_param("W")
-        old_b = output.get_param("b")
-        new_W[:old_nO] = old_W  # type: ignore
-        new_b[:old_nO] = old_b  # type: ignore
-        for i in range(old_nO, new_nO):
-            model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_output
-        model.set_ref("output", new_output)
-    # TODO: Avoid this private intrusion
-    model._dims["nO"] = new_nO
-    return model
-
-
-def init(
-    model,
-    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
-    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
-):
-    if X is not None:
-        docs, _ = X
-        model.get_ref("tok2vec").initialize(X=docs)
-    else:
-        model.get_ref("tok2vec").initialize()
-    inferred_nO = _infer_nO(Y)
-    if inferred_nO is not None:
-        current_nO = model.maybe_get_dim("nO")
-        if current_nO is None or current_nO != inferred_nO:
-            model.attrs["resize_output"](model, inferred_nO)
-    nP = model.get_dim("nP")
-    nH = model.get_dim("nH")
-    nI = model.get_dim("nI")
-    nF = model.get_dim("nF")
-    ops = model.ops
-
-    Wl = ops.alloc2f(nH * nP, nF * nI)
-    bl = ops.alloc1f(nH * nP)
-    padl = ops.alloc1f(nI)
-    # Wl = zero_init(ops, Wl.shape)
-    Wl = glorot_uniform_init(ops, Wl.shape)
-    padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize output_W
-    model.set_param("hidden_W", Wl)
-    model.set_param("hidden_b", bl)
-    model.set_param("hidden_pad", padl)
-    # model = _lsuv_init(model)
-    return model
-
-
-class TransitionModelInputs:
-    """
-    Input to transition model.
-    """
-
-    # dataclass annotation is not yet supported in Cython 0.29.x,
-    # so, we'll do something close to it.
-
-    actions: Optional[List[Ints1d]]
-    docs: List[Doc]
-    max_moves: int
-    moves: TransitionSystem
-    states: Optional[List[State]]
-
-    __slots__ = [
-        "actions",
-        "docs",
-        "max_moves",
-        "moves",
-        "states",
-    ]
-
-    def __init__(
-        self,
-        docs: List[Doc],
-        moves: TransitionSystem,
-        actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0,
-        states: Optional[List[State]] = None,
-    ):
-        """
-        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
-        docs (List[Doc]): Docs to predict transition sequences for.
-        max_moves: (int): the maximum number of moves to apply, values less
-            than 1 will apply moves to states until they are final states.
-        moves (TransitionSystem): the transition system to use when predicting
-            the transition sequences.
-        states (Optional[List[States]]): the initial states to predict the
-            transition sequences for. When absent, the initial states are
-            initialized from the provided Docs.
-        """
-        self.actions = actions
-        self.docs = docs
-        self.moves = moves
-        self.max_moves = max_moves
-        self.states = states
-
-
-def forward(model, inputs: TransitionModelInputs, is_train: bool):
-    docs = inputs.docs
-    moves = inputs.moves
-    actions = inputs.actions
-
-    beam_width = model.attrs["beam_width"]
-    hidden_pad = model.get_param("hidden_pad")
-    tok2vec = model.get_ref("tok2vec")
-
-    states = moves.init_batch(docs) if inputs.states is None else inputs.states
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    seen_mask = _get_seen_mask(model)
-
-    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
-        # Note: max_moves is only used during training, so we don't need to
-        #       pass it to the greedy inference path.
-        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
-    else:
-        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
-                                 max_moves=inputs.max_moves)
-
-
-def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
-    cdef vector[StateC*] c_states
-    cdef StateClass state
-    for state in states:
-        if not state.is_final():
-            c_states.push_back(state.c)
-    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
-    # Precomputed features have rows for each token, plus one for padding.
-    cdef int n_tokens = feats.shape[0] - 1
-    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
-    cdef CBlas cblas = model.ops.cblas()
-    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
-
-    def backprop(dY):
-        raise ValueError(Errors.E4004)
-
-    return (states, scores), backprop
-
-
-cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
-                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i
-    cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = _alloc_activations(sizes)
-    cdef np.ndarray step_scores
-    cdef np.ndarray step_actions
-
-    scores = []
-    while sizes.states >= 1:
-        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
-        step_actions = actions[0] if actions is not None else None
-        with nogil:
-            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
-            if actions is None:
-                # Validate actions, argmax, take action.
-                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                                   sizes.states)
-            else:
-                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-        sizes.states = unfinished.size()
-        scores.append(step_scores)
-        unfinished.clear()
-        actions = actions[1:] if actions is not None else None
-    _free_activations(&activations)
-
-    return scores
-
-
-def _forward_fallback(
-    model: Model,
-    moves: TransitionSystem,
-    states: List[StateClass],
-    tokvecs, backprop_tok2vec,
-    feats,
-    backprop_feats,
-    seen_mask,
-    is_train: bool,
-    actions: Optional[List[Ints1d]] = None,
-    max_moves: int = 0,
-):
-    nF = model.get_dim("nF")
-    output = model.get_ref("output")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-
-    beam_width = model.attrs["beam_width"]
-    beam_density = model.attrs["beam_density"]
-
-    ops = model.ops
-
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    if beam_width == 1:
-        batch = GreedyBatch(moves, states, None)
-    else:
-        batch = _beam_utils.BeamBatch(
-            moves, states, None, width=beam_width, density=beam_density
-        )
-    arange = ops.xp.arange(nF)
-    n_moves = 0
-    while not batch.is_done:
-        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
-        for i, state in enumerate(batch.get_unfinished_states()):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += hidden_b
-        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        cpu_scores = ops.to_numpy(scores)
-        if actions is None:
-            batch.advance(cpu_scores)
-        else:
-            batch.advance_with_actions(actions[0])
-            actions = actions[1:]
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_ids.append(ids)
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-        if n_moves >= max_moves >= 1:
-            break
-        n_moves += 1
-
-    def backprop_parser(d_states_d_scores):
-        ids = ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = ops.xp.vstack(all_statevecs)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False  # no-cython-lint
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO) @ (nS, nH).T
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        # We don't need to backprop the summation, because we pass back the IDs instead
-        d_state_features = backprop_feats((d_preacts2f, ids))
-        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (list(batch), all_scores), backprop_parser
-
-
-def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
-    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
-    for class_ in model.attrs.get("unseen_classes", set()):
-        mask[class_] = True
-    return mask
-
-
-def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("hidden_W")
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
-    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
-    W3f = W3f.transpose((1, 0, 2))
-    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
-    assert X.shape == (X.shape[0], nI), X.shape
-    Yf_ = model.ops.gemm(X, W2f, trans2=True)
-    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
-
-    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nH, nP), and get back:
-        # (nB, nH, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nH, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        dXf = model.ops.gemm(dY, W)
-        Xf = X[ids].reshape((ids.shape[0], -1))
-        dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("hidden_W", dW)
-        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
-
-    return Yf, backward
-
-
-def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
-    if Y is None:
-        return None
-    _, scores = Y
-    if len(scores) == 0:
-        return None
-    assert scores[0].shape[0] >= 1
-    assert len(scores[0].shape) == 2
-    return scores[0].shape[1]
-
-
-def _lsuv_init(model: Model):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    W = model.maybe_get_param("hidden_W")
-    if W is not None and W.any():
-        return
-
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nH, nP, nI)
-    b = model.ops.alloc2f(nH, nP)
-    pad = model.ops.alloc4f(1, nF, nH, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc_f((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc_f((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
-        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
-        vectors3f += b
-        return model.ops.maxout(vectors3f)[0]
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = cast(Floats4d, model.get_param("hidden_W").copy())
-    b = cast(Floats2d, model.get_param("hidden_b").copy())
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("hidden_W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("hidden_b", b)
-        else:
-            break
-    return model
-
-
-cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    output = model.get_ref("output")
-    cdef np.ndarray hidden_b = model.get_param("hidden_b")
-    cdef np.ndarray output_W = output.get_param("W")
-    cdef np.ndarray output_b = output.get_param("b")
-
-    cdef WeightsC weights
-    weights.feat_weights = feats
-    weights.feat_bias = <const float*>hidden_b.data
-    weights.hidden_weights = <const float *> output_W.data
-    weights.hidden_bias = <const float *> output_b.data
-    weights.seen_mask = <const int8_t*> seen_mask.data
-
-    return weights
-
-
-cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC sizes
-    sizes.states = batch_size
-    sizes.classes = model.get_dim("nO")
-    sizes.hiddens = model.get_dim("nH")
-    sizes.pieces = model.get_dim("nP")
-    sizes.feats = model.get_dim("nF")
-    sizes.embed_width = model.get_dim("nI")
-    sizes.tokens = tokens
-    return sizes
-
-
-cdef ActivationsC _alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    _resize_activations(&A, n)
-    return A
-
-
-cdef void _free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    _resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    if W.hidden_weights == NULL:
-        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = scores[0]
-    for i in range(1, n.states * n.classes):
-        if scores[i] < min_:
-            min_ = scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if W.seen_mask[j]:
-                scores[i*n.classes+j] = min_
-
-
-cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces  # no-cython-lint
-    cdef int F = n.feats
-    cdef int T = n.tokens
-    padding = cached + (T * F * O)
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 7c546752d80..273cc6c1078 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -8,8 +8,6 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
-
-from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 
 from .search import MaxViolation
@@ -31,7 +29,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(Batch):
+cdef class BeamBatch(object):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
deleted file mode 100644
index 7fee05bad60..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef int arg_max(const float* scores, const int n_classes) nogil
-cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
deleted file mode 100644
index 582756bf5be..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pyx
+++ /dev/null
@@ -1,22 +0,0 @@
-# cython: infer_types=True
-
-cdef inline int arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
-
-
-cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1c61ac271d8..04274ce8af1 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -7,6 +7,8 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -26,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    vector[int] _heads
+    int* _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,34 +36,31 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
-    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil except +:
-        this._heads.resize(length, -1)
-        this._unshiftable.resize(length, False)
-
-        # Reserve memory ahead of time to minimize allocations during parsing.
-        # The initial capacity set here ideally reflects the expected average-case/majority usage.
-        cdef int init_capacity = 32
-        this._stack.reserve(init_capacity)
-        this._rebuffer.reserve(init_capacity)
-        this._ents.reserve(init_capacity)
-        this._left_arcs.reserve(init_capacity)
-        this._right_arcs.reserve(init_capacity)
-        this.history.reserve(init_capacity)
-
+    __init__(const TokenC* sent, int length) nogil:
         this._sent = sent
+        this._heads = <int*>calloc(length, sizeof(int))
+        if not (this._sent and this._heads):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
+        for i in range(length):
+            this._heads[i] = -1
+            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
+    __dealloc__():
+        free(this._heads)
+
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -134,20 +133,19 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        cdef int stack_size = this._stack.size()
-        if i >= stack_size or i < 0:
+        if i >= this._stack.size():
             return -1
-        else:
-            return this._stack[stack_size - (i+1)]
+        elif i < 0:
+            return -1
+        return this._stack.at(this._stack.size() - (i+1))
 
     int B(int i) nogil const:
-        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < buf_size:
-            return this._rebuffer[buf_size - (i+1)]
+        elif i < this._rebuffer.size():
+            return this._rebuffer.at(this._rebuffer.size() - (i+1))
         else:
-            b_i = this._b_i + (i - buf_size)
+            b_i = this._b_i + (i - this._rebuffer.size())
             if b_i >= this.length:
                 return -1
             else:
@@ -246,7 +244,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
+        elif this._sent_starts.count(word) >= 1:
             return 1
         else:
             return 0
@@ -330,7 +328,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable[item]
+            return this._unshiftable.at(item)
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -350,9 +348,6 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        cdef ArcC* arc
-
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -361,12 +356,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = &arcs.back()
+        arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = &deref(arcs)[i]
+                arc = arcs.at(i)
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -406,11 +401,10 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        this._heads = src._heads
+        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
-        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 08f60b2634b..6ffceae10d3 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,8 +779,6 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -865,7 +863,6 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
deleted file mode 100644
index 60734e549aa..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef class Batch:
-    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
deleted file mode 100644
index 91073b52e68..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any
-
-TransitionSystem = Any  # TODO
-
-cdef class Batch:
-    def advance(self, scores):
-        raise NotImplementedError
-
-    def get_states(self):
-        raise NotImplementedError
-
-    @property
-    def is_done(self):
-        raise NotImplementedError
-
-    def get_unfinished_states(self):
-        raise NotImplementedError
-
-    def __getitem__(self, i):
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-
-class GreedyBatch(Batch):
-    def __init__(self, moves: TransitionSystem, states, golds):
-        self._moves = moves
-        self._states = states
-        self._next_states = [s for s in states if not s.is_final()]
-
-    def advance(self, scores):
-        self._next_states = self._moves.transition_states(self._next_states, scores)
-
-    def advance_with_actions(self, actions):
-        self._next_states = self._moves.apply_actions(self._next_states, actions)
-
-    def get_states(self):
-        return self._states
-
-    @property
-    def is_done(self):
-        return all(s.is_final() for s in self._states)
-
-    def get_unfinished_states(self):
-        return [st for st in self._states if not st.is_final()]
-
-    def __getitem__(self, i):
-        return self._states[i]
-
-    def __len__(self):
-        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 3a352f51ff5..0b9980ddbf2 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -159,7 +159,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+    
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -309,8 +309,6 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -646,7 +644,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
-
+ 
 
 
 cdef class Out:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index c2a0d22956a..f25408a13ba 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,10 +21,6 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
-    @property
-    def history(self):
-        return list(self.c.history)
-
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -181,6 +177,3 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
-
-    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
-        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 08baed932ba..04cd10d8864 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,10 +57,3 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 50b155bf9bb..485ce7c10bd 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,18 +3,12 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
 
 from collections import Counter
 
 import srsly
 
 from ...structs cimport TokenC
-from ...tokens.doc cimport Doc
-from ...typedefs cimport attr_t, weight_t
-from . cimport _beam_utils
-from ._parser_utils cimport arg_max_if_valid
 from .stateclass cimport StateClass
 
 from ... import util
@@ -79,18 +73,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def follow_history(self, doc, history):
-        cdef int clas
-        cdef StateClass state = StateClass(doc)
-        for clas in history:
-            action = self.c[clas]
-            action.do(state.c, action.label)
-            state.c.history.push_back(clas)
-        return state
-
     def get_oracle_sequence(self, Example example, _debug=False):
-        if not self.has_gold(example):
-            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -102,8 +85,6 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
-        if state.is_final():
-            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -129,7 +110,6 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -157,28 +137,6 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
-        state.c.history.push_back(action.clas)
-
-    def apply_actions(self, states, const int[::1] actions):
-        assert len(states) == actions.shape[0]
-        cdef StateClass state
-        cdef vector[StateC*] c_states
-        c_states.resize(len(states))
-        cdef int i
-        for (i, state) in enumerate(states):
-            c_states[i] = state.c
-        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    def transition_states(self, states, float[:, ::1] scores):
-        assert len(states) == scores.shape[0]
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -291,34 +249,3 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil:
-    cdef int i
-    cdef Transition action
-    cdef StateC* state
-    for i in range(batch_size):
-        state = states[i]
-        action = moves.c[actions[i]]
-        action.do(state, action.label)
-        state.history.push_back(action.clas)
-
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil:
-    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
-    cdef int i, guess
-    cdef Transition action
-    for i in range(batch_size):
-        moves.set_valid(is_valid, states[i])
-        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-        if guess == -1:
-            # This shouldn't happen, but it's hard to raise an error here,
-            # and we don't want to infinite loop. So, force to end state.
-            states[i].force_final()
-        else:
-            action = moves.c[guess]
-            action.do(states[i], action.label)
-            states[i].history.push_back(guess)
-    free(is_valid)
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.pyx
similarity index 96%
rename from spacy/pipeline/dep_parser.py
rename to spacy/pipeline/dep_parser.pyx
index b4961487b83..3e59deaae4f 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.pyx
@@ -4,6 +4,10 @@
 
 from thinc.api import Config, Model
 
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager
+
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
@@ -17,11 +21,12 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -121,7 +126,6 @@ def make_parser(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,7 +231,6 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
-
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,11 +238,8 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
-
     results = {}
-    results.update(
-        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-    )
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -252,12 +252,11 @@ def make_parser_scorer():
     return parser_score
 
 
-class DependencyParser(Parser):
+cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
-
     TransitionSystem = ArcEager
 
     def __init__(
@@ -277,7 +276,8 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser."""
+        """Create a DependencyParser.
+        """
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.pyx
similarity index 93%
rename from spacy/pipeline/ner.py
rename to spacy/pipeline/ner.pyx
index 1c7cf151385..c73ec5c528a 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.pyx
@@ -10,15 +10,22 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..util import registry
+from ..training import remove_bilu_prefix
+
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -43,12 +50,8 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+
 )
 def make_ner(
     nlp: Language,
@@ -101,7 +104,6 @@ def make_ner(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -115,12 +117,7 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
 def make_beam_ner(
     nlp: Language,
@@ -194,12 +191,11 @@ def make_ner_scorer():
     return ner_score
 
 
-class EntityRecognizer(Parser):
+cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -217,14 +213,15 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer."""
+        """Create an EntityRecognizer.
+        """
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,  # not relevant for NER
+            min_action_freq=1,   # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
new file mode 100644
index 00000000000..f20e69a6e56
--- /dev/null
+++ b/spacy/pipeline/transition_parser.pxd
@@ -0,0 +1,21 @@
+from cymem.cymem cimport Pool
+from thinc.backends.cblas cimport CBlas
+
+from ..vocab cimport Vocab
+from .trainable_pipe cimport TrainablePipe
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals._state cimport StateC
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+
+
+cdef class Parser(TrainablePipe):
+    cdef public object _rehearsal_model
+    cdef readonly TransitionSystem moves
+    cdef public object _multitasks
+    cdef object _cpu_ops
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 99970b3fe93..4290420c788 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,16 +1,21 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
 from typing import Dict, Iterable, List, Optional, Tuple
-
-cimport numpy as np
 from cymem.cymem cimport Pool
-
-import contextlib
-import random
+cimport numpy as np
 from itertools import islice
+from libcpp.vector cimport vector
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free
+import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
+import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -24,7 +29,16 @@ from thinc.api import (
 )
 from thinc.types import Floats2d, Ints1d
 
-from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..tokens.doc cimport Doc
+from .trainable_pipe import TrainablePipe
+from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
 from ..typedefs cimport weight_t
@@ -52,7 +66,7 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 NUMPY_OPS = NumpyOps()
 
 
-class Parser(TrainablePipe):
+cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -152,9 +166,8 @@ class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
-        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -261,6 +274,15 @@ class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -268,38 +290,50 @@ class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        # We distill as follows: 1. we first let the student predict transition
-        # sequences (and the corresponding transition probabilities); (2) we
-        # let the teacher follow the student's predicted transition sequences
-        # to obtain the teacher's transition probabilities; (3) we compute the
-        # gradients of the student's transition distributions relative to the
-        # teacher's distributions.
-
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
-        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
 
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-        backprop_scores((student_states, d_scores))
+        backprop_tok2vec(student_docs)
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
         return losses
 
     def get_teacher_student_loss(
-            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-            normalize: bool = False,
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -311,28 +345,10 @@ class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-
-        # We can't easily hook up a softmax layer in the parsing model, since
-        # the get_loss does additional masking. So, we could apply softmax
-        # manually here and use Thinc's cross-entropy loss. But it's a bit
-        # suboptimal, since we can have a lot of states that would result in
-        # many kernel launches. Futhermore the parsing model's backprop expects
-        # a XP array, so we'd have to concat the softmaxes anyway. So, like
-        # the get_loss implementation, we'll compute the loss and gradients
-        # ourselves.
-
-        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-                                                axis=-1, inplace=True)
-        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-                                                axis=-1, inplace=True)
-
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = student_scores - teacher_scores
-        if normalize:
-            d_scores /= d_scores.shape[0]
-        loss = (d_scores**2).sum() / d_scores.size
-
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -355,6 +371,9 @@ class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
+        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
+            deals with a failing batch of documents. The default function just reraises
+            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -375,29 +394,78 @@ class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
-        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states_or_beams, _ = self.model.predict(inputs)
-        return states_or_beams
+        if self.cfg["beam_width"] == 1:
+            return self.greedy_parse(docs, drop=0.0)
+        else:
+            return self.beam_parse(
+                docs,
+                drop=0.0,
+                beam_width=self.cfg["beam_width"],
+                beam_density=self.cfg["beam_density"]
+            )
 
     def greedy_parse(self, docs, drop=0.):
-        self._resize()
+        cdef vector[StateC*] states
+        cdef StateClass state
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=1):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states, _ = self.model.predict(inputs)
-        return states
+        set_dropout_rate(self.model, drop)
+        batch = self.moves.init_batch(docs)
+        model = self.model.predict(docs)
+        weights = get_c_weights(model)
+        for state in batch:
+            if not state.is_final():
+                states.push_back(state.c)
+        sizes = get_c_sizes(model, states.size())
+        with nogil:
+            self._parseC(cblas, &states[0], weights, sizes)
+        model.clear_memory()
+        del model
+        return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
+        cdef Beam beam
+        cdef Doc doc
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            beams, _ = self.model.predict(inputs)
-        return beams
+        batch = _beam_utils.BeamBatch(
+            self.moves,
+            self.moves.init_batch(docs),
+            None,
+            beam_width,
+            density=beam_density
+        )
+        model = self.model.predict(docs)
+        while not batch.is_done:
+            states = batch.get_unfinished_states()
+            if not states:
+                break
+            scores = model.predict(states)
+            batch.advance(scores)
+        model.clear_memory()
+        del model
+        return list(batch)
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil:
+        cdef int i, j
+        cdef vector[StateC*] unfinished
+        cdef ActivationsC activations = alloc_activations(sizes)
+        while sizes.states >= 1:
+            predict_states(cblas, &activations, states, &weights, sizes)
+            # Validate actions, argmax, take action.
+            self.c_transition_batch(states,
+                activations.scores, sizes.classes, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+            sizes.states = unfinished.size()
+            unfinished.clear()
+        free_activations(&activations)
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -408,6 +476,35 @@ class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
+    def transition_states(self, states, float[:, ::1] scores):
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
+        cdef int i, guess
+        cdef Transition action
+        for i in range(batch_size):
+            self.moves.set_valid(is_valid, states[i])
+            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+            if guess == -1:
+                # This shouldn't happen, but it's hard to raise an error here,
+                # and we don't want to infinite loop. So, force to end state.
+                states[i].force_final()
+            else:
+                action = self.moves.c[guess]
+                action.do(states[i], action.label)
+        free(is_valid)
+
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -418,99 +515,67 @@ class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
-        # We need to take care to act on the whole batch, because we might be
-        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        docs = [eg.x for eg in examples if len(eg.x)]
-
+        # The probability we use beam update, instead of falling back to
+        # a greedy update
+        beam_update_prob = self.cfg["beam_update_prob"]
+        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
+            return self.update_beam(
+                examples,
+                beam_width=self.cfg["beam_width"],
+                sgd=sgd,
+                losses=losses,
+                beam_density=self.cfg["beam_density"]
+            )
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            init_states, gold_states, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states, golds, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
-
-        inputs = TransitionModelInputs(docs=docs,
-                                       moves=self.moves,
-                                       max_moves=max_moves,
-                                       states=[state.copy() for state in init_states])
-        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
-        if sum(s.shape[0] for s in scores) == 0:
+            states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
             return losses
-        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-                                 examples, max_moves)
-        backprop_scores((pred_states, d_scores))
+        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
+ 
+        all_states = list(states)
+        states_golds = list(zip(states, golds))
+        n_moves = 0
+        while states_golds:
+            states, golds = zip(*states_golds)
+            scores, backprop = model.begin_update(states)
+            d_scores = self.get_batch_loss(states, golds, scores, losses)
+            # Note that the gradient isn't normalized by the batch size
+            # here, because our "samples" are really the states...But we
+            # can't normalize by the number of states either, as then we'd
+            # be getting smaller gradients for states in long sequences.
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, scores)
+            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(golds)
         if sgd not in (None, False):
             self.finish_update(sgd)
-        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
 
-    def get_loss(self, states_scores, examples, max_moves):
-        gold_states, init_states, pred_states, scores = states_scores
-        scores = self.model.ops.xp.vstack(scores)
-        costs = self._get_costs_from_histories(
-            examples,
-            gold_states,
-            init_states,
-            [list(state.history) for state in pred_states],
-            max_moves
-        )
-        xp = get_array_module(scores)
-        best_costs = costs.min(axis=1, keepdims=True)
-        gscores = scores.copy()
-        min_score = scores.min() - 1000
-        assert costs.shape == scores.shape, (costs.shape, scores.shape)
-        gscores[costs > best_costs] = min_score
-        max_ = scores.max(axis=1, keepdims=True)
-        gmax = gscores.max(axis=1, keepdims=True)
-        exp_scores = xp.exp(scores - max_)
-        exp_gscores = xp.exp(gscores - gmax)
-        Z = exp_scores.sum(axis=1, keepdims=True)
-        gZ = exp_gscores.sum(axis=1, keepdims=True)
-        d_scores = exp_scores / Z
-        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
-        return d_scores
-
-    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
-        cdef TransitionSystem moves = self.moves
-        cdef StateClass state
-        cdef int clas
-        cdef int nO = moves.n_moves
-        cdef Pool mem = Pool()
-        cdef np.ndarray costs_i
-        is_valid = <int*>mem.alloc(nO, sizeof(int))
-        batch = list(zip(init_states, histories, gold_states))
-        n_moves = 0
-        output = []
-        while batch:
-            costs = numpy.zeros((len(batch), nO), dtype="f")
-            for i, (state, history, gold) in enumerate(batch):
-                costs_i = costs[i]
-                clas = history.pop(0)
-                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
-                action = moves.c[clas]
-                action.do(state.c, action.label)
-                state.c.history.push_back(clas)
-            output.append(costs)
-            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
-            if n_moves >= max_moves >= 1:
-                break
-            n_moves += 1
-
-        return self.model.ops.xp.vstack(output)
-
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -520,9 +585,10 @@ class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.0)
+        losses.setdefault(self.name, 0.)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
+        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -530,33 +596,85 @@ class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
-        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
-
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
-
-        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
-        student_scores = self.model.ops.xp.vstack(student_scores)
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
-        # If all weights for an output are 0 in the original model, don't
-        # supervise that output. This allows us to add classes.
-        loss = (d_scores**2).sum() / d_scores.size
-        backprop_scores((student_states, d_scores))
-
+        tutor, _ = self._rehearsal_model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
+        n_scores = 0.
+        loss = 0.
+        while states:
+            targets, _ = tutor.begin_update(states)
+            guesses, backprop = model.begin_update(states)
+            d_scores = (guesses - targets) / targets.shape[0]
+            # If all weights for an output are 0 in the original model, don't
+            # supervise that output. This allows us to add classes.
+            loss += (d_scores**2).sum()
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, guesses)
+            states = [state for state in states if not state.is_final()]
+            n_scores += d_scores.size
+        # Do the backprop
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += loss
-
+        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width, drop=0.,
-                    sgd=None, losses=None, beam_density=0.0):
-        raise NotImplementedError
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
+            return losses
+        # Prepare the stepwise model, and get the callback for finishing the batch
+        model, backprop_tok2vec = self.model.begin_update(
+            [eg.predicted for eg in examples])
+        loss = _beam_utils.update_beam(
+            self.moves,
+            states,
+            golds,
+            model,
+            beam_width,
+            beam_density=beam_density,
+        )
+        losses[self.name] += loss
+        backprop_tok2vec(golds)
+        if sgd is not None:
+            self.finish_update(sgd)
+
+    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
+        cdef StateClass state
+        cdef Pool mem = Pool()
+        cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+
+        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
+        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
+        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
+                                        dtype='f', order='C')
+        c_d_scores = <float*>d_scores.data
+        unseen_classes = self.model.attrs["unseen_classes"]
+        for i, (state, gold) in enumerate(zip(states, golds)):
+            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
+            memset(costs, 0, self.moves.n_moves * sizeof(float))
+            self.moves.set_costs(is_valid, costs, state.c, gold)
+            for j in range(self.moves.n_moves):
+                if costs[j] <= 0.0 and j in unseen_classes:
+                    unseen_classes.remove(j)
+            cpu_log_loss(c_d_scores,
+                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
+        if losses is not None:
+            losses.setdefault(self.name, 0.)
+            losses[self.name] += (d_scores**2).sum()
+        return d_scores
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -595,7 +713,7 @@ class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize((doc_sample, self.moves))
+        self.model.initialize(doc_sample)
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -688,27 +806,26 @@ class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long doc will get multiple states. Let's say we
-        have a doc of length 2*N, where N is the shortest doc. We'll make
-        two states, one representing long_doc[:N], and another representing
-        long_doc[N:]."""
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-            TransitionSystem moves = self.moves
-        all_states = moves.init_batch([eg.predicted for eg in examples])
+        all_states = self.moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if moves.has_gold(eg) and not state.is_final():
-                gold = moves.init_gold(state, eg)
+            if self.moves.has_gold(eg) and not state.is_final():
+                gold = self.moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = moves.get_oracle_sequence_from_state(
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -718,52 +835,13 @@ class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = moves.c[clas]
+                    action = self.moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
-
-
-@contextlib.contextmanager
-def _change_attrs(model, **kwargs):
-    """Temporarily modify a thinc model's attributes."""
-    unset = object()
-    old_attrs = {}
-    for key, value in kwargs.items():
-        old_attrs[key] = model.attrs.get(key, unset)
-        model.attrs[key] = value
-    yield model
-    for key, value in old_attrs.items():
-        if value is unset:
-            model.attrs.pop(key)
-        else:
-            model.attrs[key] = value
-
-
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
-    cdef int step
-    cdef StateClass state
-    cdef StateC* c_state
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for state in states:
-            c_state = state.c
-            if step < c_state.history.size():
-                step_actions.append(c_state.history[step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index bb9b7653ce3..5d5fbb02790 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,7 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-def test_overfitting_IO():
-    fix_random_seed(1)
+@pytest.mark.parametrize("use_upper", [True, False])
+def test_overfitting_IO(use_upper):
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {}})
+    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,6 +577,7 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
+        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index d25eb165acb..42cf5ced998 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-import itertools
-
-import numpy
 import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
@@ -62,8 +59,6 @@
     ),
 ]
 
-PARSERS = ["parser"]  # TODO: Test beam_parser when ready
-
 eps = 0.1
 
 
@@ -176,57 +171,6 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
-def test_parser_apply_actions(en_vocab, en_parser):
-    words = ["I", "ate", "pizza"]
-    words2 = ["Eat", "more", "pizza", "!"]
-    doc1 = Doc(en_vocab, words=words)
-    doc2 = Doc(en_vocab, words=words2)
-    docs = [doc1, doc2]
-
-    moves = en_parser.moves
-    moves.add_action(0, "")
-    moves.add_action(1, "")
-    moves.add_action(2, "nsubj")
-    moves.add_action(3, "obj")
-    moves.add_action(2, "amod")
-
-    actions = [
-        numpy.array([0, 0], dtype="i"),
-        numpy.array([2, 0], dtype="i"),
-        numpy.array([0, 4], dtype="i"),
-        numpy.array([3, 3], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([0], dtype="i"),
-        numpy.array([1], dtype="i"),
-    ]
-
-    states = moves.init_batch(docs)
-    active_states = states
-
-    for step_actions in actions:
-        active_states = moves.apply_actions(active_states, step_actions)
-
-    assert len(active_states) == 0
-
-    for (state, doc) in zip(states, docs):
-        moves.set_annotations(state, doc)
-
-    assert docs[0][0].head.i == 1
-    assert docs[0][0].dep_ == "nsubj"
-    assert docs[0][1].head.i == 1
-    assert docs[0][1].dep_ == "ROOT"
-    assert docs[0][2].head.i == 1
-    assert docs[0][2].dep_ == "obj"
-
-    assert docs[1][0].head.i == 0
-    assert docs[1][0].dep_ == "ROOT"
-    assert docs[1][1].head.i == 2
-    assert docs[1][1].dep_ == "amod"
-    assert docs[1][2].head.i == 0
-    assert docs[1][2].dep_ == "obj"
-
-
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -375,7 +319,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", PARSERS)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -401,15 +345,11 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize(
-    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
-)
-def test_overfitting_IO(pipe_name, max_moves):
-    fix_random_seed(0)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
-    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -511,12 +451,10 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TODO: re-enable after we have a spacy-legacy release for v4. See
-        # https://github.com/explosion/spacy-legacy/pull/36
-        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V1
+        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V2
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
-        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f6cefbc1f84..c3c4bb6c686 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v3"
+    @architectures = "spacy.TransitionBasedParser.v2"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..8a1c74ca9ed 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,11 +189,33 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
+use_upper = true
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+parser_config_string_no_upper = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 66
+maxout_pieces = 2
+use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -224,6 +246,7 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
+        use_upper=True,
     )
     return parser
 
@@ -337,16 +360,15 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("W")
-        assert output.has_param("b")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        assert model.get_ref("upper").get_dim("nI") == 65
+        assert model.get_ref("lower").get_dim("nI") == 65
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -359,13 +381,11 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("b")
-        assert output.has_param("W")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        if model.attrs["has_upper"]:
+            assert model.get_ref("upper").get_dim("nI") == 66
+        assert model.get_ref("lower").get_dim("nI") == 66
 
 
 def test_config_nlp_roundtrip():
@@ -561,7 +581,9 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c05ef625e11..44717f6eba2 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,19 +1,22 @@
 import ctypes
 import os
 from pathlib import Path
-
 import pytest
-from pydantic import ValidationError
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+
+try:
+    from pydantic.v1 import ValidationError
+except ImportError:
+    from pydantic import ValidationError  # type: ignore
+
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int, find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -96,6 +99,34 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
+    assert model.get_param("W").shape == (nF, nO, nP, nI)
+    tensor = model.ops.alloc((10, nI))
+    Y, get_dX = model.begin_update(tensor)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
+    dY = model.ops.alloc((15, nO, nP))
+    ids = model.ops.alloc((15, nF))
+    ids[1, 2] = -1
+    dY[1] = 1
+    assert not model.has_grad("pad")
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 2, 0, 0] == 1.0
+    ids.fill(0.0)
+    dY.fill(0.0)
+    dY[0] = 0
+    ids[1, 2] = 0
+    ids[1, 1] = -1
+    ids[1, 0] = -1
+    dY[1] = 1
+    ids[2, 0] = -1
+    dY[2] = 5
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 0, 0, 0] == 6
+    assert d_pad[0, 1, 0, 0] == 1
+    assert d_pad[0, 2, 0, 0] == 0
+
+
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 914e877f579..8217de5bfe7 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,6 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
-
+import warnings
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index db8f974ea19..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,17 +833,18 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v3"
+> @architectures = "spacy.TransitionBasedParser.v2"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
+> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -873,22 +874,23 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                       |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
- </Accordion>
+</Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 765bcb8c675..1fae1dc6cda 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -741,7 +741,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -764,7 +764,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 44c80622437..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 1b0bc9606e9..2bd2856b6a3 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From c9f810748c37bf8afc1fc08f1957198e834852dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:24:09 +0100
Subject: [PATCH 086/504] isort

---
 spacy/ml/models/parser.py                   |  9 +++--
 spacy/ml/parser_model.pxd                   |  5 ++-
 spacy/ml/parser_model.pyx                   |  7 ++--
 spacy/ml/tb_framework.py                    |  3 +-
 spacy/pipeline/_parser_internals/_state.pxd |  3 +-
 spacy/pipeline/dep_parser.pyx               |  3 +-
 spacy/pipeline/ner.pyx                      |  9 +++--
 spacy/pipeline/transition_parser.pxd        |  6 +--
 spacy/pipeline/transition_parser.pyx        | 45 +++++++++++++--------
 spacy/tests/parser/test_ner.py              |  1 -
 spacy/tests/test_misc.py                    | 20 ++++-----
 spacy/training/example.pyx                  |  4 +-
 12 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea8f..f6c0e565dd3 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...errors import Errors
 from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-from ...tokens import Doc
 
 
 @registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea53f..ca31c169964 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
 from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
 
 
 cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 91558683b60..90e836f8a0a 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,18 +1,19 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
 cimport numpy as np
 from libc.math cimport exp
-from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 
 import numpy
 import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
 
 from .. import util
 from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
 from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e24e..e351ad4e570 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
 from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
 from ..util import registry
+from .parser_model import ParserStepModel
 
 
 @registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 04274ce8af1..c063cf97cd4 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,4 +1,5 @@
 cimport libcpp
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -7,8 +8,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3e59deaae4f..1fdc55dab41 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
 
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c73ec5c528a..29f22b0174d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -10,13 +10,14 @@ from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+
 from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..util import registry
+from ..scorer import PRFScore, get_ner_prf
 from ..training import remove_bilu_prefix
-
+from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index f20e69a6e56..a48d76b6819 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -1,11 +1,11 @@
 from cymem.cymem cimport Pool
 from thinc.backends.cblas cimport CBlas
 
+from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
 from ..vocab cimport Vocab
-from .trainable_pipe cimport TrainablePipe
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
 from ._parser_internals._state cimport StateC
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 
 cdef class Parser(TrainablePipe):
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 4290420c788..2fb3af44ddf 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,21 +1,20 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
+from cymem.cymem cimport Pool
+
 from itertools import islice
-from libcpp.vector cimport vector
-from libc.string cimport memset, memcpy
+
 from libc.stdlib cimport calloc, free
+from libc.string cimport memcpy, memset
+from libcpp.vector cimport vector
+
 import random
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -23,21 +22,36 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
+    softmax_activation,
+    use_ops,
 )
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
-from ._parser_internals.stateclass cimport StateClass
-from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..ml.parser_model cimport (
+    ActivationsC,
+    SizesC,
+    WeightsC,
+    alloc_activations,
+    arg_max_if_valid,
+    cpu_log_loss,
+    free_activations,
+    get_c_sizes,
+    get_c_weights,
+    predict_states,
+)
 from ..tokens.doc cimport Doc
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateClass
+
 from .trainable_pipe import TrainablePipe
+
 from ._parser_internals cimport _beam_utils
+
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
@@ -62,7 +76,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
-
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 5d5fbb02790..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,7 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 44717f6eba2..d2a41ff0fed 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 from pathlib import Path
+
 import pytest
 
 try:
@@ -8,15 +9,16 @@
 except ImportError:
     from pydantic import ValidationError  # type: ignore
 
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 8217de5bfe7..e41f9e02eb3 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,6 @@
-# cython: profile=False
-from collections.abc import Iterable as IterableInstance
 import warnings
+from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64

From 02fdc8f12261d8ba1e106b863a53a1d9e208e2fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:38:01 +0100
Subject: [PATCH 087/504] Add distillation tests with max cut size

And fix endless loop when the max cut size is 0 or 1.
---
 spacy/pipeline/transition_parser.pyx | 2 +-
 spacy/tests/parser/test_ner.py       | 5 ++++-
 spacy/tests/parser/test_parse.py     | 5 ++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 2fb3af44ddf..17a4fdb1b93 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -302,7 +302,7 @@ cdef class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
             states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..7c3a9d56249 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 42cf5ced998..dbede7edd52 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -402,7 +402,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -420,6 +422,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 980acd22424f04dc2e62b1c26e1864044bc21ceb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:02:15 +0100
Subject: [PATCH 088/504] Fix Cython lints

---
 spacy/ml/parser_model.pxd                |  9 ++--
 spacy/ml/parser_model.pyx                | 64 ++++++++++++------------
 spacy/pipeline/_parser_internals/ner.pyx |  4 +-
 spacy/pipeline/dep_parser.pyx            |  1 +
 spacy/pipeline/ner.pyx                   |  3 +-
 spacy/pipeline/transition_parser.pxd     |  4 +-
 spacy/pipeline/transition_parser.pyx     | 42 ++++++----------
 spacy/training/example.pyx               |  1 -
 8 files changed, 58 insertions(+), 70 deletions(-)

diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index ca31c169964..88386255147 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -41,10 +41,9 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
 cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
- 
+                         const WeightsC* W, SizesC n) nogil
+
 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
- 
+cdef void cpu_log_loss(float* d_scores, const float* costs,
+                       const int* is_valid, const float* scores, int O) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 90e836f8a0a..843275f4c8b 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -13,7 +13,7 @@ from .. import util
 from ..errors import Errors
 
 from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
@@ -78,31 +78,31 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
+                                   n.states * n.classes * sizeof(A.scores[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
+                         const WeightsC* W, SizesC n) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
     memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
+                       n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
+                     &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = _arg_max(&A.unmaxed[index], n.pieces)
@@ -112,10 +112,10 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
         memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
     else:
         # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes)
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
+                     <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens, 0.0,
+                     A.scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
@@ -131,9 +131,9 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
                 A.scores[i*n.classes+j] = min_
 
 
-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
+cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
+                             const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     padding = cached
     cached += F * O
@@ -150,9 +150,8 @@ cdef void sum_state_features(CBlas cblas, float* output,
         token_ids += F
 
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
+cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
+                       const float* scores, int O) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -178,7 +177,7 @@ cdef void cpu_log_loss(float* d_scores,
 
 
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
+                         const int* is_valid, int n) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -202,10 +201,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
     return best
 
 
-
 class ParserStepModel(Model):
     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
+                 dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
         self.attrs["dropout_rate"] = dropout
@@ -267,7 +265,7 @@ class ParserStepModel(Model):
 
     def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
         if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+           and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
             # Move token_ids and d_vector to GPU, asynchronously
             self.backprops.append((
                 util.get_async(self.cuda_stream, token_ids),
@@ -277,7 +275,6 @@ class ParserStepModel(Model):
         else:
             self.backprops.append((token_ids, d_vector, get_d_tokvecs))
 
-
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -290,14 +287,15 @@ class ParserStepModel(Model):
             ids = ids.flatten()
             d_state_features = d_state_features.reshape(
                 (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
+            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
         # Padded -- see update()
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs
 
+
 NUMPY_OPS = NumpyOps()
 
+
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
@@ -310,7 +308,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
+        def get_d_vector(d_scores): return d_scores
     # If the class is unseen, make sure its score is minimum
     scores[:, model._class_mask == 0] = numpy.nanmin(scores)
 
@@ -445,8 +443,8 @@ cdef class precompute_hiddens:
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(cblas, <float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0], token_ids.shape[0],
+                           self.nF, self.nO*self.nP)
         state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
@@ -471,7 +469,7 @@ cdef class precompute_hiddens:
 
         def backprop_maxout(d_best):
             return self.ops.backprop_maxout(d_best, mask, self.nP)
-        
+
         return state_vector, backprop_maxout
 
     def _relu_nonlinearity(self, state_vector):
@@ -485,7 +483,7 @@ cdef class precompute_hiddens:
         def backprop_relu(d_best):
             d_best *= mask
             return d_best.reshape((d_best.shape + (1,)))
- 
+
         return state_vector, backprop_relu
 
 cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 0b9980ddbf2..be769bd9cd0 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -159,7 +159,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-    
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -644,7 +644,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
- 
+
 
 
 cdef class Out:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fdc55dab41..cbd7187ff0f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 29f22b0174d..fe54d33a17b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index a48d76b6819..7adb82213de 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser(TrainablePipe):
     cdef object _cpu_ops
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
+                      WeightsC weights, SizesC sizes) nogil
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
+                                 int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 17a4fdb1b93..fa9a76772ec 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -10,7 +10,7 @@ from cymem.cymem cimport Pool
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
@@ -23,14 +23,13 @@ from thinc.api import (
     NumpyOps,
     Optimizer,
     chain,
-    get_array_module,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
     ActivationsC,
@@ -45,7 +44,6 @@ from ..ml.parser_model cimport (
     predict_states,
 )
 from ..tokens.doc cimport Doc
-from ._parser_internals.search cimport Beam
 from ._parser_internals.stateclass cimport StateClass
 
 from .trainable_pipe import TrainablePipe
@@ -55,11 +53,10 @@ from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
-from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals cimport _beam_utils
 from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals.transition_system cimport Transition
 from .trainable_pipe cimport TrainablePipe
 
 from .. import util
@@ -295,7 +292,7 @@ cdef class Parser(TrainablePipe):
         with use_ops("numpy"):
             teacher_model = chain(teacher_step_model, softmax_activation())
             student_model = chain(student_step_model, softmax_activation())
-        
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -440,8 +437,6 @@ cdef class Parser(TrainablePipe):
         return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
         batch = _beam_utils.BeamBatch(
             self.moves,
@@ -462,15 +457,15 @@ cdef class Parser(TrainablePipe):
         return list(batch)
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+                      WeightsC weights, SizesC sizes) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(states, activations.scores,
+                                    sizes.classes, sizes.states)
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -499,7 +494,7 @@ cdef class Parser(TrainablePipe):
         return [state for state in states if not state.c.is_final()]
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
+                                 int nr_class, int batch_size) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -557,8 +552,7 @@ cdef class Parser(TrainablePipe):
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
- 
-        all_states = list(states)
+
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -638,8 +632,8 @@ cdef class Parser(TrainablePipe):
         del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
+                    losses=None, beam_density=0.0):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -670,7 +664,7 @@ cdef class Parser(TrainablePipe):
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                        dtype='f', order='C')
+                                               dtype='f', order='C')
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -680,8 +674,8 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
+                         d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
@@ -791,10 +785,7 @@ cdef class Parser(TrainablePipe):
         long_doc[:N], and another representing long_doc[N:]. In contrast to
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
-        cdef:
-            StateClass start_state
-            StateClass state
-            Transition action
+        cdef StateClass state
         all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
@@ -816,7 +807,6 @@ cdef class Parser(TrainablePipe):
                 length += 1
         return states
 
-
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index e41f9e02eb3..efca4bcb03b 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,3 @@
-import warnings
 from collections.abc import Iterable as IterableInstance
 
 import numpy

From e59089436e6f7091661085f2a7e29e4df8011099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:17:24 +0100
Subject: [PATCH 089/504] Bring back W401

---
 spacy/errors.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index a5d0b3d11a9..5d6d65e3b26 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

From a20abb73bd5ba1d2d179ed370e205bd48e4e15d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 19 Dec 2023 09:28:20 +0100
Subject: [PATCH 090/504] Fix `TransitionBasedParser` version in transformer
 embeddings docs

---
 website/docs/usage/embeddings-transformers.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..534cf478087 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From b1bf2cc13cff814c6bb7af57a41f86717d99b6fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 09:47:38 +0100
Subject: [PATCH 091/504] No need for `Literal` compat, since we only support
 >= 3.8

---
 spacy/compat.py           | 5 -----
 spacy/errors.py           | 1 -
 spacy/ml/models/parser.py | 3 +--
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 30459e2e495..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,11 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 5d6d65e3b26..1af8a3b0891 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1006,7 +1006,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
-
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..e776174f6ed 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,9 +1,8 @@
-from typing import List, Optional, cast
+from typing import List, Literal, Optional
 
 from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry

From 8a1c1982a3760d1e875ab87d712ab2a0e4f8a780 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 10:06:28 +0100
Subject: [PATCH 092/504] Fix parser distillation test seed

The test would sometimes fail. Rather than increasing test by increasing
training iterations, use a known-good seed.
---
 spacy/tests/parser/test_parse.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index dbede7edd52..f63d56f6922 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -405,6 +405,7 @@ def test_is_distillable():
 @pytest.mark.slow
 @pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
 def test_distill(max_moves):
+    fix_random_seed(0)
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []

From 1b315381a292f156ed9470b442492004e218963a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:14:35 +0100
Subject: [PATCH 093/504] TransitionBasedParser.v2 in run example output

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/docs/api/cli.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 1fae1dc6cda..cfa99a2b350 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]

From ffc64c529e0fe39d1194c5fa3f039bcfda292d34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 16 Jan 2024 14:54:26 +0100
Subject: [PATCH 094/504] Update thinc dependency to 9.0.0.dev4

---
 pyproject.toml                         |  2 +-
 requirements.txt                       |  2 +-
 setup.cfg                              | 10 +++++++++-
 spacy/pipeline/edit_tree_lemmatizer.py |  7 ++-----
 spacy/pipeline/morphologizer.pyx       |  7 +++----
 spacy/pipeline/senter.pyx              |  7 ++-----
 spacy/pipeline/tagger.pyx              | 13 ++++++++-----
 spacy/pipeline/transition_parser.pyx   |  4 ++--
 8 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3891d137867..0a5bc162773 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=9.0.0.dev4,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 37179bc79d0..1a2459498a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev4,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 39ca2dfa743..745043291cb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev4,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -37,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev2,<9.1.0
+    thinc>=9.0.0.dev4,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 046ef19c3d5..1a29735e8e8 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -5,7 +5,6 @@
 import numpy as np
 import srsly
 from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
@@ -128,9 +127,7 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(
-            normalize=False, missing_value=-1
-        )
+        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
 
         truths = []
         for eg in examples:
@@ -166,7 +163,7 @@ def get_teacher_student_loss(
 
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 669a5424412..0f77326e67d 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -2,9 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, Optional, Union
 
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
@@ -296,7 +294,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 42615e194e0..51670dcf8cf 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -2,10 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
-import srsly
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..tokens.doc cimport Doc
 
@@ -160,7 +157,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f3d0527ea0b..21c7b3ab0a3 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -4,9 +4,7 @@ from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
-import srsly
-from thinc.api import Config, Model, set_dropout_rate
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
 from thinc.types import Floats2d, Ints1d
 
 from ..morphology cimport Morphology
@@ -275,7 +273,7 @@ class Tagger(TrainablePipe):
         
         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
@@ -292,7 +290,12 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
+        loss_func = SequenceCategoricalCrossentropy(
+            names=self.labels,
+            normalize=False,
+            neg_prefix=self.cfg["neg_prefix"],
+            label_smoothing=self.cfg["label_smoothing"]
+        )
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index fa9a76772ec..c728f1b7909 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -22,13 +22,13 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    SequenceCategoricalCrossentropy,
     chain,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
@@ -355,7 +355,7 @@ cdef class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))

From 70c2068514d89b69fafb45a18b013cb4d9181136 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 17 Jan 2024 09:53:01 +0100
Subject: [PATCH 095/504] Temporily xfail local remote storage test

---
 spacy/tests/test_cli.py | 61 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a47f03e8ab4..c9e823ffe68 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -878,6 +878,67 @@ def test_applycli_user_data():
         assert result[0]._.ext == val
 
 
+# TODO: remove this xfail after merging master into v4. The issue
+#       is that for local files, pathy started returning os.stat_result,
+#       which doesn't have a last_modified property. So, recency-sorting
+#       fails and the test fails. However, once we merge master into
+#       v4, we'll use weasel, which in turn uses cloudpathlib, which
+#       should resolve this issue.
+@pytest.mark.xfail(reason="Recency sorting is broken on some platforms")
+def test_local_remote_storage():
+    with make_tempdir() as d:
+        filename = "a.txt"
+
+        content_hashes = ("aaaa", "cccc", "bbbb")
+        for i, content_hash in enumerate(content_hashes):
+            # make sure that each subsequent file has a later timestamp
+            if i > 0:
+                time.sleep(1)
+            content = f"{content_hash} content"
+            loc_file = d / "root" / filename
+            if not loc_file.parent.exists():
+                loc_file.parent.mkdir(parents=True)
+            with loc_file.open(mode="w") as file_:
+                file_.write(content)
+
+            # push first version to remote storage
+            remote = RemoteStorage(d / "root", str(d / "remote"))
+            remote.push(filename, "aaaa", content_hash)
+
+            # retrieve with full hashes
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with command hash
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa")
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with content hash
+            loc_file.unlink()
+            remote.pull(filename, content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with no hashes
+            loc_file.unlink()
+            remote.pull(filename)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+
+def test_local_remote_storage_pull_missing():
+    # pulling from a non-existent remote pulls nothing gracefully
+    with make_tempdir() as d:
+        filename = "a.txt"
+        remote = RemoteStorage(d / "root", str(d / "remote"))
+        assert remote.pull(filename, command_hash="aaaa") is None
+        assert remote.pull(filename) is None
+
+
 def test_cli_find_threshold(capsys):
     def make_examples(nlp: Language) -> List[Example]:
         docs: List[Example] = []

From fb19c534d27d4b480d987efd503ed35997726486 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 30 Oct 2023 17:02:08 +0100
Subject: [PATCH 096/504] Add note in docs on `score_weight` config if using a
 non-default `spans_key` for SpanCat (#13093)

* Add note on score_weight if using a non-default span_key for SpanCat.

* Fix formatting.

* Fix formatting.

* Fix typo.

* Use warning infobox.

* Fix infobox formatting.
---
 website/docs/api/spancategorizer.mdx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 258db794786..33219751ca6 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -75,8 +75,7 @@ architectures and their arguments and hyperparameters.
 <Infobox variant="warning">
 
 If you set a non-default value for `spans_key`, you'll have to update
-`[training.score_weights]` as well so that weights are computed properly. E. g.
-for `spans_key == "myspankey"`, include this in your config:
+`[training.score_weights]` as well so that weights are computed properly. E. g. for `span_key == "myspankey"`, include this in your config:
 
 ```ini
 [training.score_weights]

From a40f819d23cf958d162b065b511f2eb352ebb64b Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 10 Nov 2023 08:05:07 +0100
Subject: [PATCH 097/504] Warn about reloading dependencies after downloading
 models (#13081)

* Update the "Missing factory" error message

This accounts for model installations that took place during the current Python session.

* Add a note about Jupyter notebooks

* Move error to `spacy.cli.download`
Add extra message for Jupyter sessions

* Add additional note for interactive sessions

* Remove note about `spacy-transformers` from error message

* `isort`

* Improve checks for colab (also helps displacy)

* Update warning messages

* Improve flow for multiple checks

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/download.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0635522930b..5e460717cc4 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,10 +7,11 @@
 from wasabi import msg
 
 from .. import about
+from ..errors import OLD_MODEL_SHORTCUTS
 from ..util import (
-    get_installed_models,
     get_minor_version,
-    get_package_version,
+    is_in_interactive,
+    is_in_jupyter,
     is_package,
     is_prerelease_version,
     run_command,

From de480b6baaff827f7444f37fc0a15eed410aab70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 29 Nov 2023 09:11:54 +0100
Subject: [PATCH 098/504] Update `TextCatBOW` to use the fixed `SparseLinear`
 layer (#13149)

* Update `TextCatBOW` to use the fixed `SparseLinear` layer

A while ago, we fixed the `SparseLinear` layer to use all available
parameters: https://github.com/explosion/thinc/pull/754

This change updates `TextCatBOW` to `v3` which uses the new
`SparseLinear_v2` layer. This results in a sizeable improvement on a
text categorization task that was tested.

While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent`
option to make it possible to change the hidden size. Ideally, we'd just
have an option called `length`. But the way that `TextCatBOW` uses
hashes results in a non-uniform distribution of parameters when the
length is not a power of two.

* Replace TexCatBOW `length_exponent` parameter by `length`

We now round up the length to the next power of two if it isn't
a power of two.

* Remove some tests for TextCatBOW.v2

* Fix missing import
---
 spacy/errors.py                      |  3 ---
 spacy/tests/pipeline/test_textcat.py |  8 +++---
 website/docs/api/architectures.mdx   | 40 ++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 1af8a3b0891..571335009be 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -974,9 +974,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
-    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
-             "reduction. Please enable one of `use_reduce_first`, "
-             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3f2d757eebc..3653739befd 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -499,9 +499,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # REDUCE
-        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        # CNN
+        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
     ],
 )
 # fmt: on
@@ -749,7 +749,7 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2 (legacy)
+        # CNN V2
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # PARAMETRIC ATTENTION V1
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..31beb15644c 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,6 +1020,46 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
+<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+</Accordion>
+
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}
+
 > #### Example Config
 >
 > ```ini

From 845ce570bf55de2728409be859bbd689476bb41b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:00:06 +0100
Subject: [PATCH 099/504] Add TextCatReduce.v1 (#13181)

* Add TextCatReduce.v1

This is a textcat classifier that pools the vectors generated by a
tok2vec implementation and then applies a classifier to the pooled
representation. Three reductions are supported for pooling: first, max,
and mean. When multiple reductions are enabled, the reductions are
concatenated before providing them to the classification layer.

This model is a generalization of the TextCatCNN model, which only
supports mean reductions and is a bit of a misnomer, because it can also
be used with transformers. This change also reimplements TextCatCNN.v2
using the new TextCatReduce.v1 layer.

* Doc fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fully specify `TextCatCNN` <-> `TextCatReduce` equivalence

* Move TextCatCNN docs to legacy, in prep for moving to spacy-legacy

* Add back a test for TextCatCNN.v2

* Replace TextCatCNN in pipe configurations and templates

* Add an infobox to the `TextCatReduce` section with an `TextCatCNN` anchor

* Add last reduction (`use_reduce_last`)

* Remove non-working TextCatCNN Netlify redirect

* Revert layer changes for the quickstart

* Revert one more quickstart change

* Remove unused import

* Fix docstring

* Fix setting name in error message

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                      |  3 +
 spacy/ml/models/textcat.py           | 85 ++++------------------------
 spacy/tests/pipeline/test_textcat.py | 13 ++---
 website/docs/api/architectures.mdx   | 78 -------------------------
 4 files changed, 21 insertions(+), 158 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 571335009be..1af8a3b0891 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -974,6 +974,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 601c94a7f0a..1a49bac1d9d 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -22,6 +22,9 @@
     reduce_first,
     reduce_last,
     reduce_max,
+    reduce_first,
+    reduce_last,
+    reduce_max,
     reduce_mean,
     reduce_sum,
     residual,
@@ -63,6 +66,15 @@ def build_simple_cnn_text_classifier(
         use_reduce_mean=True,
         nO=nO,
     )
+    return build_reduce_text_classifier(
+        tok2vec=tok2vec,
+        exclusive_classes=exclusive_classes,
+        use_reduce_first=False,
+        use_reduce_last=False,
+        use_reduce_max=False,
+        use_reduce_mean=True,
+        nO=nO,
+    )
 
 
 def resize_and_set_ref(model, new_nO, resizable_layer):
@@ -221,79 +233,6 @@ def build_text_classifier_lowdata(
     return model
 
 
-@registry.architectures("spacy.TextCatParametricAttention.v1")
-def build_textcat_parametric_attention_v1(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    exclusive_classes: bool,
-    nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    width = tok2vec.maybe_get_dim("nO")
-    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
-        tok2vec=tok2vec,
-        nonlinear_layer=Maxout(nI=width, nO=width),
-        key_transform=Gelu(nI=width, nO=width),
-    )
-    with Model.define_operators({">>": chain}):
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO)
-        else:
-            output_layer = Linear(nO=nO) >> Logistic()
-        model = parametric_attention >> output_layer
-    if model.has_dim("nO") is not False and nO is not None:
-        model.set_dim("nO", cast(int, nO))
-    model.set_ref("output_layer", output_layer)
-    model.attrs["multi_label"] = not exclusive_classes
-
-    return model
-
-
-def _build_parametric_attention_with_residual_nonlinear(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    nonlinear_layer: Model[Floats2d, Floats2d],
-    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
-) -> Model[List[Doc], Floats2d]:
-    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
-        norm_layer = LayerNorm(nI=width)
-        parametric_attention = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
-        )
-
-        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
-
-        parametric_attention.set_ref("tok2vec", tok2vec)
-        parametric_attention.set_ref("attention_layer", attention_layer)
-        parametric_attention.set_ref("key_transform", key_transform)
-        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
-        parametric_attention.set_ref("norm_layer", norm_layer)
-
-        return parametric_attention
-
-
-def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nI", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nO", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
-    return model
-
-
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
     tok2vec: Model,
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3653739befd..9ee93af0fef 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -473,6 +473,8 @@ def test_no_resize(name, textcat_config):
         # CNN
         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -499,9 +501,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -749,12 +751,9 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2
+        # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
-        # PARAMETRIC ATTENTION V1
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 31beb15644c..63f723a28cf 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,46 +1020,6 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v3 {id="TextCatBOW"}
-
 > #### Example Config
 >
 > ```ini
@@ -1096,44 +1056,6 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
-### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatParametricAttention.v1"
-> exclusive_classes = true
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.Tok2Vec.v2"
->
-> [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v2"
-> width = 64
-> rows = [2000, 2000, 1000, 1000, 1000, 1000]
-> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
-> include_static_vectors = false
->
-> [model.tok2vec.encode]
-> @architectures = "spacy.MaxoutWindowEncoder.v2"
-> width = ${model.tok2vec.embed.width}
-> window_size = 1
-> maxout_pieces = 3
-> depth = 2
-> ```
-
-A neural network model that is built upon Tok2Vec and uses parametric attention
-to attend to tokens that are relevant to text classification.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From 785915c58c1ab05046133c9a1e19e74ed8ebc8fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 2 Jan 2024 10:03:06 +0100
Subject: [PATCH 100/504] Add spacy.TextCatParametricAttention.v1 (#13201)

* Add spacy.TextCatParametricAttention.v1

This layer provides is a simplification of the ensemble classifier that
only uses paramteric attention. We have found empirically that with a
sufficient amount of training data, using the ensemble classifier with
BoW does not provide significant improvement in classifier accuracy.
However, plugging in a BoW classifier does reduce GPU training and
inference performance substantially, since it uses a GPU-only kernel.

* Fix merge fallout
---
 pyproject.toml                       |  5 ++-
 requirements.txt                     |  2 +-
 setup.cfg                            |  4 +-
 spacy/ml/models/textcat.py           | 65 ++++++++++++++++++++++++++++
 spacy/tests/pipeline/test_textcat.py |  3 ++
 website/docs/api/architectures.mdx   | 38 ++++++++++++++++
 6 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0a5bc162773..bfd7e68d1f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev4,<9.1.0",
-    "numpy>=1.15.0",
+    "thinc>=8.2.2,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 1a2459498a8..553c5fec20b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev4,<9.1.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 745043291cb..b521d2daf90 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -45,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 1a49bac1d9d..4b3d2de9171 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -233,6 +233,71 @@ def build_text_classifier_lowdata(
     return model
 
 
+@registry.architectures("spacy.TextCatParametricAttention.v1")
+def build_textcat_parametric_attention_v1(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    exclusive_classes: bool,
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    width = tok2vec.maybe_get_dim("nO")
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=Gelu(nI=width, nO=width),
+    )
+    with Model.define_operators({">>": chain}):
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO)
+        else:
+            output_layer = Linear(nO=nO) >> Logistic()
+        model = parametric_attention >> output_layer
+    if model.has_dim("nO") is not False and nO is not None:
+        model.set_dim("nO", cast(int, nO))
+    model.set_ref("output_layer", output_layer)
+    model.attrs["multi_label"] = not exclusive_classes
+
+    return model
+
+
+def _build_parametric_attention_with_residual_nonlinear(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    nonlinear_layer: Model[Floats2d, Floats2d],
+    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
+) -> Model[List[Doc], Floats2d]:
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.maybe_get_dim("nO")
+        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
+        norm_layer = LayerNorm(nI=width)
+        parametric_attention = (
+            tok2vec
+            >> list2ragged()
+            >> attention_layer
+            >> reduce_sum()
+            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
+        )
+
+        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
+
+        parametric_attention.set_ref("tok2vec", tok2vec)
+        parametric_attention.set_ref("attention_layer", attention_layer)
+        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
+        parametric_attention.set_ref("norm_layer", norm_layer)
+
+        return parametric_attention
+
+
+def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    tok2vec_width = get_tok2vec_width(model)
+    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
+    init_chain(model, X, Y)
+    return model
+
+
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
     tok2vec: Model,
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 9ee93af0fef..2bba40d1d13 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -754,6 +754,9 @@ def test_overfitting_IO_multi():
         # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 63f723a28cf..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1056,6 +1056,44 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
+### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatParametricAttention.v1"
+> exclusive_classes = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v2"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v2"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v2"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
+> ```
+
+A neural network model that is built upon Tok2Vec and uses parametric attention
+to attend to tokens that are relevant to text classification.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From b83102bfc7995f21f83a1da0e73bfdc3506badf5 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 4 Dec 2023 15:23:28 +0100
Subject: [PATCH 101/504] Add documentation for EL task (#12988)

* Add documentation for EL task.

* Fix EL factory name.

* Add llm_entity_linker_mentio.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Incorporate feedback.

* Format.

* Fix link to KB data.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 website/docs/api/large-language-models.mdx | 172 ++++++++++++++++++++-
 1 file changed, 169 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx
index cefd5c66ee1..583aa25a44d 100644
--- a/website/docs/api/large-language-models.mdx
+++ b/website/docs/api/large-language-models.mdx
@@ -20,9 +20,10 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible
 through a generic `llm`
 [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
 as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
-`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`,
-`llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the
-GPT-3-5 model from OpenAI is used by default, but this can be customized.
+`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and
+`llm_entity_linker`.
+
+### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
 
 > #### Example
 >
@@ -687,6 +688,171 @@ for a toy example of how such a KB file might look like.
 | -------- | ------------------------------------- |
 | `path`   | Path to KB file. ~~Union[str, Path]~~ |
 
+### EL (Entity Linking) {id="nel"}
+
+The EL links recognized entities (see [NER](#ner)) to those in a knowledge base
+(KB). The EL task prompts the LLM to select the most likely candidate from the
+KB, whose structure can be arbitrary.
+
+Note that the documents processed by the entity linking task are expected to
+have recognized entities in their `.ents` attribute. This can be achieved by
+either running the [NER task](#ner), using a trained spaCy NER model or setting
+the entities manually prior to running the EL task.
+
+In order to be able to pull data from the KB, an object implementing the
+`CandidateSelector` protocol has to be provided. This requires two functions:
+(1) `__call__()` to fetch candidate entities for entity mentions in the text
+(assumed to be available in `Doc.ents`) and (2) `get_entity_description()` to
+fetch descriptions for any given entity ID. Descriptions can be empty, but
+ideally provide more context for entities stored in the KB.
+
+`spacy-llm` provides a `CandidateSelector` implementation
+(`spacy.CandidateSelector.v1`) that leverages a spaCy knowledge base - as used
+in an `entity_linking` component - to select candidates. This knowledge base can
+be loaded from an existing spaCy pipeline (note that the pipeline's EL component
+doesn't have to be trained) or from a separate .yaml file.
+
+#### spacy.EntityLinker.v1 {id="el-v1"}
+
+Supports zero- and few-shot prompting. Relies on a configurable component
+suggesting viable entities before letting the LLM pick the most likely
+candidate.
+
+> #### Example config for spacy.EntityLinker.v1
+>
+> ```ini
+> [paths]
+> el_nlp = null
+>
+> ...
+>
+> [components.llm.task]
+> @llm_tasks = "spacy.EntityLinker.v1"
+>
+> [initialize]
+> [initialize.components]
+> [initialize.components.llm]
+> [initialize.components.llm.candidate_selector]
+> @llm_misc = "spacy.CandidateSelector.v1"
+>
+> # Load a KB from a KB file. For loading KBs from spaCy pipelines see spacy.KBObjectLoader.v1.
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base .yaml file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument              | Description                                                                                                                                                                                   |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `template`            | Custom prompt template to send to LLM model. Defaults to [entity_linker.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/entity_linker.v1.jinja). ~~str~~ |
+| `parse_responses`     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[EntityLinkerTask]]~~                                   |
+| `prompt_example_type` | Type to use for fewshot examples. Defaults to `ELExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                 |
+| `examples`            | Optional callable that reads a file containing task examples for few-shot learning. If `None` is passed, zero-shot learning will be used. Defaults to `None`. ~~ExamplesConfigType~~          |
+| `scorer`              | Scorer function. Defaults to the metric used by spaCy to evaluate entity linking performance. ~~Optional[Scorer]~~                                                                            |
+
+##### spacy.CandidateSelector.v1 {id="candidate-selector-v1"}
+
+`spacy.CandidateSelector.v1` is an implementation of the `CandidateSelector`
+protocol required by [`spacy.EntityLinker.v1`](#el-v1). The built-in candidate
+selector method allows loading existing knowledge bases in several ways, e. g.
+loading from a spaCy pipeline with a (not necessarily trained) entity linking
+component, and loading from a file describing the knowlege base as a .yaml file.
+Either way the loaded data will be converted to a spaCy `InMemoryLookupKB`
+instance. The KB's selection capabilities are used to select the most likely
+entity candidates for the specified mentions.
+
+> #### Example config for spacy.CandidateSelector.v1
+>
+> ```ini
+> [initialize]
+> [initialize.components]
+> [initialize.components.llm]
+> [initialize.components.llm.candidate_selector]
+> @llm_misc = "spacy.CandidateSelector.v1"
+>
+> # Load a KB from a KB file. For loading KBs from spaCy pipelines see spacy.KBObjectLoader.v1.
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base .yaml file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument    | Description                                                       |
+| ----------- | ----------------------------------------------------------------- |
+| `kb_loader` | KB loader object. ~~InMemoryLookupKBLoader~~                      |
+| `top_n`     | Top-n candidates to include in the prompt. Defaults to 5. ~~int~~ |
+
+##### spacy.KBObjectLoader.v1 {id="kb-object-loader-v1"}
+
+Adheres to the `InMemoryLookupKBLoader` interface required by
+[`spacy.CandidateSelector.v1`](#candidate-selector-v1). Loads a knowledge base
+from an existing spaCy pipeline.
+
+> #### Example config for spacy.KBObjectLoader.v1
+>
+> ```ini
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBObjectLoader.v1"
+> # Path to knowledge base directory in serialized spaCy pipeline.
+> path = ${paths.el_kb}
+> # Path to spaCy pipeline. If this is not specified, spacy-llm tries to determine this automatically (but may fail).
+> nlp_path = ${paths.el_nlp}
+> # Path to file with descriptions for entity.
+> desc_path = ${paths.el_desc}
+> ```
+
+| Argument          | Description                                                                                                                                                                                                                         |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`            | Path to KB file. ~~Union[str, Path]~~                                                                                                                                                                                               |
+| `nlp_path`        | Path to serialized NLP pipeline. If None, path will be guessed. ~~Optional[Union[Path, str]]~~                                                                                                                                      |
+| `desc_path`       | Path to file with descriptions for entities. ~~int~~                                                                                                                                                                                |
+| `ent_desc_reader` | Entity description reader. Defaults to an internal method expecting a CSV file without header row, with ";" as delimiters, and with two columns - one for the entitys' IDs, one for their descriptions. ~~Optional[EntDescReader]~~ |
+
+##### spacy.KBFileLoader.v1 {id="kb-file-loader-v1"}
+
+Adheres to the `InMemoryLookupKBLoader` interface required by
+[`spacy.CandidateSelector.v1`](#candidate-selector-v1). Loads a knowledge base
+from a knowledge base file. The KB .yaml file has to stick to the following
+format:
+
+```yaml
+entities:
+  # The key should be whatever ID identifies this entity uniquely in your knowledge base.
+  ID1:
+      name: "..."
+      desc: "..."
+  ID2:
+      ...
+# Data on aliases in your knowledge base - e. g. "Apple" for the entity "Apple Inc.".
+aliases:
+  - alias: "..."
+    # List of all entities that this alias refers to.
+    entities: ["ID1", "ID2", ...]
+    # Optional: prior probabilities that this alias refers to the n-th entity in the "entities" attribute.
+    probabilities: [0.5, 0.2, ...]
+  - alias: "..."
+    entities: [...]
+    probabilities: [...]
+  ...
+```
+
+See
+[here](https://github.com/explosion/spacy-llm/blob/main/usage_examples/el_openai/el_kb_data.yml)
+for a toy example of how such a KB file might look like.
+
+> #### Example config for spacy.KBFileLoader.v1
+>
+> ```ini
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument | Description                           |
+| -------- | ------------------------------------- |
+| `path`   | Path to KB file. ~~Union[str, Path]~~ |
+
 ### NER {id="ner"}
 
 The NER task identifies non-overlapping entities in text.

From b110efb90227478fb554c9e767f6e367d55acff8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 10:28:46 +0100
Subject: [PATCH 102/504] Typing fixes

---
 requirements.txt           |  2 +-
 spacy/tokens/span.pyi      |  2 ++
 spacy/training/example.pyi |  4 ++++
 spacy/training/example.pyx |  6 ++++++
 spacy/training/loop.py     | 26 ++++++++++++++------------
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 553c5fec20b..94d3afb8f70 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 2a529593e5f..f1030278c69 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -139,6 +139,8 @@ class Span:
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
+    @label_.setter
+    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property
diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
index 06639d70c06..33cf07b0902 100644
--- a/spacy/training/example.pyi
+++ b/spacy/training/example.pyi
@@ -9,6 +9,10 @@ def annotations_to_doc(
     tok_annot: Dict[str, Any],
     doc_annot: Dict[str, Any],
 ) -> Doc: ...
+def validate_distillation_examples(
+    examples: Iterable[Example],
+    method: str,
+) -> None: ...
 def validate_examples(
     examples: Iterable[Example],
     method: str,
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index efca4bcb03b..bc6852f83c6 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,12 @@ def validate_examples(examples, method):
 
 
 def validate_distillation_examples(examples, method):
+    """Check that a batch of examples received during processing is valid
+    for distillation.
+
+    examples (Iterable[Examples]): A batch of examples.
+    method (str): The method name to show in error messages.
+    """
     validate_examples(examples, method)
     for eg in examples:
         if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 63715ec2c42..575a583b78c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -12,7 +12,9 @@
     Iterable,
     List,
     Optional,
+    Sized,
     Tuple,
+    TypeVar,
     Union,
 )
 
@@ -22,7 +24,6 @@
 from .. import ty
 from ..errors import Errors
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
-from ..tokens.doc import Doc
 from ..util import (
     logger,
     registry,
@@ -282,7 +283,7 @@ def _distill_loop(
     teacher: "Language",
     student: "Language",
     optimizer: Optimizer,
-    distill_data: Iterable[List[Example]],
+    distill_data: Iterable[Tuple[int, List[Example]]],
     evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
@@ -401,7 +402,7 @@ def _distill_loop(
 def train_while_improving(
     nlp: "Language",
     optimizer: Optimizer,
-    train_data: Iterable[List[Example]],
+    train_data: Iterable[Tuple[int, List[Example]]],
     evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
@@ -520,15 +521,16 @@ def train_while_improving(
             break
 
 
+ItemT = TypeVar("ItemT", bound=Sized)
+
+
 def subdivide_batch(
-    batch: Union[Iterable[Doc], Iterable[Example]], accumulate_gradient: int
-):
+    batch: Iterable[ItemT], accumulate_gradient: int
+) -> Iterable[List[ItemT]]:
     batch = list(batch)
     if len(batch):
-        if isinstance(batch[0], Example):
-            batch.sort(key=lambda eg: len(eg.predicted))
-        else:
-            batch.sort(key=lambda doc: len(doc))
+        # Examples are sorted by their predicted length.
+        batch.sort(key=lambda item: len(item))
     sub_len = len(batch) // accumulate_gradient
     start = 0
     for i in range(accumulate_gradient):
@@ -578,7 +580,7 @@ def create_distill_batches(
     corpus: Callable[["Language"], Iterable[Example]],
     batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
     max_epochs: int,
-):
+) -> Iterable[Tuple[int, List[Example]]]:
     """Create distillation batches. In contrast to training, the corpus
     is normally too large to load into memory and shuffle."""
     epoch = 0
@@ -592,9 +594,9 @@ def create_distill_batches(
 def create_train_batches(
     nlp: "Language",
     corpus: Callable[["Language"], Iterable[Example]],
-    batcher: Callable[[Iterable[Example]], Iterable[Example]],
+    batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
     max_epochs: int,
-):
+) -> Iterable[Tuple[int, List[Example]]]:
     epoch = 0
     if max_epochs >= 0:
         examples = list(corpus(nlp))  # type: Iterable[Example]

From 11ec468a7b5bce9329e640c9d254d352814d8d4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 12:20:09 +0100
Subject: [PATCH 103/504] Py_UNICODE is not compatible with 3.12

---
 spacy/pipeline/_parser_internals/search.pyx |  2 +-
 spacy/tests/parser/_search.pyx              | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 578299b56ae..52d5cdaa891 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
 from cymem.cymem cimport Pool
 from libc.math cimport exp
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index cd9e6b2f5ee..ca2a2916094 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -12,7 +12,7 @@ from ..conftest import cytest
 cdef struct TestState:
     int length
     int x
-    Py_UNICODE* string
+    char *string
 
 
 cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
@@ -22,7 +22,7 @@ cdef int transition(void* dest, void* src, class_t clas, void* extra_args) excep
     dest_state.x = src_state.x
     dest_state.x += clas
     if extra_args != NULL:
-        dest_state.string = <Py_UNICODE*>extra_args
+        dest_state.string = <char *>extra_args
     else:
         dest_state.string = src_state.string
 
@@ -32,9 +32,9 @@ cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
     state.length = n
     state.x = 1
     if extra_args == NULL:
-        state.string = u'default'
+        state.string = 'default'
     else:
-        state.string = <Py_UNICODE*>extra_args
+        state.string = <char *>extra_args
     return state
 
 
@@ -77,7 +77,7 @@ def test_initialize(nr_class, beam_width, length):
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length, s.length
-        assert s.string == 'default'
+        assert s.string.decode('utf8') == 'default'
 
 
 @cytest
@@ -88,11 +88,12 @@ def test_initialize(nr_class, beam_width, length):
                          ]
                          )
 def test_initialize_extra(nr_class, beam_width, length, extra):
+    extra = extra.encode("utf-8") if extra is not None else None
     b = Beam(nr_class, beam_width)
     if extra is None:
         b.initialize(initialize, destroy, length, NULL)
     else:
-        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+        b.initialize(initialize, destroy, length, <void*><char*>extra)
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length

From bc38472fcc7a7b92b6172f3be111b1a033934ff5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 14:59:01 +0100
Subject: [PATCH 104/504] Construct TextCatEnsemble.v2 using helper function

---
 spacy/ml/models/textcat.py | 44 +++++++-------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 4b3d2de9171..19ae2579984 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -19,6 +19,7 @@
     clone,
     concatenate,
     list2ragged,
+    noop,
     reduce_first,
     reduce_last,
     reduce_max,
@@ -160,55 +161,26 @@ def build_text_classifier_v2(
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
-    # TODO: build the model with _build_parametric_attention_with_residual_nonlinear
-    # in spaCy v4. We don't do this in spaCy v3 to preserve model
-    # compatibility.
+    width = tok2vec.maybe_get_dim("nO")
     exclusive_classes = not linear_model.attrs["multi_label"]
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=noop(),
+    )
     with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention(width)
-        maxout_layer = Maxout(nO=width, nI=width)
-        norm_layer = LayerNorm(nI=width)
-        cnn_model = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(maxout_layer >> norm_layer >> Dropout(0.0))
-        )
-
         nO_double = nO * 2 if nO else None
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=nO_double)
         else:
             output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
-        model = (linear_model | cnn_model) >> output_layer
+        model = (linear_model | parametric_attention) >> output_layer
         model.set_ref("tok2vec", tok2vec)
     if model.has_dim("nO") is not False and nO is not None:
         model.set_dim("nO", cast(int, nO))
     model.set_ref("output_layer", linear_model.get_ref("output_layer"))
-    model.set_ref("attention_layer", attention_layer)
-    model.set_ref("maxout_layer", maxout_layer)
-    model.set_ref("norm_layer", norm_layer)
     model.attrs["multi_label"] = not exclusive_classes
 
-    model.init = init_ensemble_textcat  # type: ignore[assignment]
-    return model
-
-
-def init_ensemble_textcat(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("maxout_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
     return model
 
 

From 1f656222986fb804538a098a3791aac735c4c44c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 15:02:02 +0100
Subject: [PATCH 105/504] Remove `setup_requires` from `setup.cfg`

---
 setup.cfg | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index b521d2daf90..8665113a6a6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,14 +30,6 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0

From 68a6825b0b743371dfa283cc7f2bd2a941dacb27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 17:18:49 +0100
Subject: [PATCH 106/504] Fix up requirements test

To account for buil dependencies being removed from `setup.cfg`.
---
 spacy/tests/package/test_requirements.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index a63b1d8b060..86bdc730c19 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -67,26 +67,28 @@ def test_build_dependencies():
                     "{} and {} respectively".format(lib, v, req_v)
                 )
                 setup_keys.add(lib)
-    assert sorted(setup_keys) == sorted(
-        req_dict.keys()
-    )  # if fail: requirements.txt contains a lib not in setup.cfg
 
     # check pyproject.toml and compare the versions of the libs to requirements.txt
     # does not fail when there are missing or additional libs
     toml_file = root_dir / "pyproject.toml"
     with toml_file.open() as f:
         lines = f.readlines()
+    pyproject_keys = set()
     for line in lines:
         line = line.strip().strip(",").strip('"')
         if not line.startswith("#"):
             lib, v = _parse_req(line)
             if lib and lib not in libs_ignore_requirements:
+                pyproject_keys.add(lib)
                 req_v = req_dict.get(lib, None)
                 assert (lib + v) == (lib + req_v), (
                     "{} has different version in pyproject.toml and in requirements.txt: "
                     "{} and {} respectively".format(lib, v, req_v)
                 )
 
+    # if fail: requirements.txt contains a lib not in setup.cfg or pyproject.toml
+    assert set(setup_keys).union(set(pyproject_keys)) == set(req_dict.keys())
+
 
 def _parse_req(line):
     lib = re.match(r"^[a-z0-9\-]*", line).group(0)

From 910423ecf470cedb04a90773593c8efa30ecc35a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 12:54:23 +0100
Subject: [PATCH 107/504] Set version to v4.0.0.dev2 (#13269)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 73f201af5fb..ef80718fee0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev1"
+__version__ = "4.0.0.dev2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 6e432864f8bba60324a75ba6ae79e89978188f03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 18:24:22 +0100
Subject: [PATCH 108/504] Update `spacy-legacy` dependency to 4.0.0.dev1
 (#13270)

This release is compatible with the parser refactor backout.
---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 94d3afb8f70..d7b9a348064 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev0,<4.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index 8665113a6a6..1ff4b91c524 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,7 +32,7 @@ include_package_data = true
 python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=4.0.0.dev0,<4.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0

From c51781bb48a504615efa199d784774f0e4288dd0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Feb 2024 14:14:55 +0100
Subject: [PATCH 109/504] Remove debug data normalization for span analysis
 (#13203)

* Remove debug data normalization for span analysis

As a result of this normalization, `debug data` could show a user tokens
that do not exist in their data.

* Update spacy/cli/debug_data.py

---------

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/debug_data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 714969be145..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
+            t = token.text.lower()
             word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)

From 3c564f9915bd2d3b9b11bec536d0f7d475202569 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 22 Mar 2024 18:21:20 +0100
Subject: [PATCH 110/504] remove smart_open requirement as it's taken care of
 via Weasel (#13391)

---
 requirements.txt | 1 -
 setup.cfg        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d7b9a348064..80d725dc80c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,6 @@ wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.10.0
-smart-open>=5.2.1,<7.0.0
 weasel>=0.1.0,<0.4.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
diff --git a/setup.cfg b/setup.cfg
index 1ff4b91c524..2c2d6f379fa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -44,7 +44,6 @@ install_requires =
     weasel>=0.1.0,<0.4.0
     # Third-party dependencies
     typer>=0.3.0,<0.10.0
-    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0; python_version < "3.9"
     numpy>=1.19.0; python_version >= "3.9"

From d5ecf84dbe560171bfb6e5f22344dad469c1d49c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 25 Mar 2024 10:17:57 +0100
Subject: [PATCH 111/504] Move DocSearch key to env var [ci skip]

---
 website/meta/site.json           | 1 -
 website/next.config.mjs          | 3 +++
 website/src/components/search.js | 5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/website/meta/site.json b/website/meta/site.json
index f1d318071c4..55fe60ad364 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -23,7 +23,6 @@
     },
     "docSearch": {
         "appId": "Y1LB128RON",
-        "apiKey": "bb601a1daab73e2dc66faf2b79564807",
         "indexName": "spacy"
     },
     "binderUrl": "explosion/spacy-io-binder",
diff --git a/website/next.config.mjs b/website/next.config.mjs
index df3b1d01d09..5e2f8f8c34d 100644
--- a/website/next.config.mjs
+++ b/website/next.config.mjs
@@ -32,6 +32,9 @@ const nextConfig = withPWA(
             ignoreBuildErrors: true,
         },
         images: { unoptimized: true },
+        env: {
+            DOCSEARCH_API_KEY: process.env.DOCSEARCH_API_KEY
+        }
     })
 )
 
diff --git a/website/src/components/search.js b/website/src/components/search.js
index f80d9cd9f00..3211b53c002 100644
--- a/website/src/components/search.js
+++ b/website/src/components/search.js
@@ -1,4 +1,4 @@
-import React, { useEffect, useState } from 'react'
+import React from 'react'
 import PropTypes from 'prop-types'
 import { DocSearch } from '@docsearch/react'
 import '@docsearch/css'
@@ -6,7 +6,8 @@ import '@docsearch/css'
 import siteMetadata from '../../meta/site.json'
 
 export default function Search({ placeholder = 'Search docs' }) {
-    const { apiKey, indexName, appId } = siteMetadata.docSearch
+    const apiKey = process.env.DOCSEARCH_API_KEY
+    const { indexName, appId } = siteMetadata.docSearch
     return (
         <DocSearch appId={appId} indexName={indexName} apiKey={apiKey} placeholder={placeholder} />
     )

From 2567719b37aa0d6a01ed3742ed2fa1d8c96be5d0 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 26 Mar 2024 09:53:07 +0100
Subject: [PATCH 112/504] Renamed main branch back to v4 for now (#13395)

* Update gputests.yml

* Update slowtests.yml
---
 .github/workflows/gputests.yml  | 2 +-
 .github/workflows/slowtests.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml
index c6ea98f76c7..66e0707e0d5 100644
--- a/.github/workflows/gputests.yml
+++ b/.github/workflows/gputests.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, main]
+        branch: [master, v4]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
index 4a4f0800590..f9fd3e81769 100644
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, main]
+        branch: [master, v4]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:

From 6e73136b948629d6f4ba63e26286ae2a2d7f9aa2 Mon Sep 17 00:00:00 2001
From: Yaseen <9275716+ynx0@users.noreply.github.com>
Date: Tue, 26 Mar 2024 06:15:25 -0500
Subject: [PATCH 113/504] Update code.module.sass to make code title sticky
 (#13379)

---
 website/src/styles/code.module.sass | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/website/src/styles/code.module.sass b/website/src/styles/code.module.sass
index b619c71ccfd..459281b4322 100644
--- a/website/src/styles/code.module.sass
+++ b/website/src/styles/code.module.sass
@@ -109,6 +109,8 @@
     box-shadow: inset 1px 1px 1px rgba(0, 0, 0, 0.25)
     background: var(--color-dark)
     margin: 1.5rem 0 0 2rem
+    position: sticky
+    left: 2rem
 
 .header
     width: 100%

From 4c8863a7171ecd86fdb3a9a9c9b9947dd2147eaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 12 Jan 2022 13:38:52 +0100
Subject: [PATCH 114/504] Span/SpanGroup: wrap SpanC in shared_ptr (#9869)

* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/_parser_internals/ner.pyx |  32 +++---
 spacy/tests/doc/test_span.py             |  23 ++--
 spacy/tokens/span.pxd                    |  11 +-
 spacy/tokens/span.pyx                    | 131 ++++++++++++++---------
 spacy/tokens/span_group.pxd              |   5 +-
 spacy/tokens/span_group.pyx              |  22 ++--
 6 files changed, 132 insertions(+), 92 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index e4312bd2f92..c77b7b50f2d 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,4 +1,8 @@
-# cython: profile=False
+import os
+import random
+from libc.stdint cimport int32_t
+from libcpp.memory cimport shared_ptr
+from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 
@@ -47,9 +51,7 @@ MOVE_NAMES[OUT] = 'O'
 
 cdef struct GoldNERStateC:
     Transition* ner
-    SpanC* negs
-    int32_t length
-    int32_t nr_neg
+    vector[shared_ptr[SpanC]] negs
 
 
 cdef class BiluoGold:
@@ -82,8 +84,6 @@ cdef GoldNERStateC create_gold_state(
         negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
-    gs.nr_neg = len(negs)
     ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
@@ -97,8 +97,8 @@ cdef GoldNERStateC create_gold_state(
     # In order to handle negative samples, we need to maintain the full
     # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
     # thing, we'll get blocked if there's an incorrect prefix.
-    for i, neg in enumerate(negs):
-        gs.negs[i] = neg.c
+    for neg in negs:
+        gs.negs.push_back(neg.c)
     return gs
 
 
@@ -413,6 +413,8 @@ cdef class Begin:
         cdef int g_act = gold.ner[b0].move
         cdef attr_t g_tag = gold.ner[b0].label
 
+        cdef shared_ptr[SpanC] span
+
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
@@ -430,8 +432,8 @@ cdef class Begin:
             # be correct or not. However, we can at least tell whether we're
             # going to be opening an entity where there's only one possible
             # L.
-            for span in gold.negs[:gold.nr_neg]:
-                if span.label == label and span.start == b0:
+            for span in gold.negs:
+                if span.get().label == label and span.get().start == b0:
                     cost += 1
                     break
         return cost
@@ -572,8 +574,9 @@ cdef class Last:
         # If we have negative-example entities, integrate them into the objective,
         # by marking actions that close an entity that we know is incorrect
         # as costly.
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
                 cost += 1
                 break
         return cost
@@ -637,8 +640,9 @@ cdef class Unit:
         # This is fairly straight-forward for U- entities, as we have a single
         # action
         cdef int b0 = s.B(0)
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and span.start == b0 and span.end == (b0+1):
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
                 cost += 1
                 break
         return cost
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 98a74bc2145..e5c71dafcf7 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -5,7 +5,8 @@
 
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
-from spacy.tokens import Doc, Span, Token
+from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
@@ -163,16 +164,16 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
-def test_char_span_attributes(doc):
-    label = "LABEL"
-    kb_id = "KB_ID"
-    span_id = "SPAN_ID"
-    span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
-    span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
-    assert span1.text == span2.text
-    assert span1.label_ == span2.label_ == label
-    assert span1.kb_id_ == span2.kb_id_ == kb_id
-    assert span1.id_ == span2.id_ == span_id
+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
 
 
 def test_spans_sent_spans(doc):
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index d77bbea7035..ce318ed0dfb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 
 from ..structs cimport SpanC
@@ -7,19 +8,21 @@ from .doc cimport Doc
 
 cdef class Span:
     cdef readonly Doc doc
-    cdef SpanC c
+    cdef shared_ptr[SpanC] c
     cdef public _vector
     cdef public _vector_norm
 
     @staticmethod
-    cdef inline Span cinit(Doc doc, SpanC span):
+    cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span):
         cdef Span self = Span.__new__(
             Span,
             doc,
-            start=span.start,
-            end=span.end
+            start=span.get().start,
+            end=span.get().end
         )
         self.c = span
         return self
 
     cpdef np.ndarray to_array(self, object features)
+
+    cdef SpanC* span_c(self)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e179bbce7eb..17c4c4c6059 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,8 +1,7 @@
 # cython: profile=False
 cimport numpy as np
-
-import copy
-import warnings
+from libc.math cimport sqrt
+from libcpp.memory cimport make_shared
 
 import numpy
 from thinc.api import get_array_module
@@ -115,7 +114,7 @@ cdef class Span:
             end_char = start_char
         else:
             end_char = doc[end - 1].idx + len(doc[end - 1])
-        self.c = SpanC(
+        self.c = make_shared[SpanC](SpanC(
             label=label,
             kb_id=kb_id,
             id=span_id,
@@ -123,7 +122,7 @@ cdef class Span:
             end=end,
             start_char=start_char,
             end_char=end_char,
-        )
+        ))
         self._vector = vector
         self._vector_norm = vector_norm
 
@@ -133,32 +132,46 @@ cdef class Span:
                 return False
             else:
                 return True
-        if not isinstance(other, Span):
-            return False
-        cdef Span other_span = other
-        self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
-        other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc)
+
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* other_span_c = other.span_c()
+
         # <
         if op == 0:
-            return self_tuple < other_tuple
+            return span_c.start_char < other_span_c.start_char
         # <=
         elif op == 1:
-            return self_tuple <= other_tuple
+            return span_c.start_char <= other_span_c.start_char
         # ==
         elif op == 2:
-            return self_tuple == other_tuple
+            # Do the cheap comparisons first
+            return (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # !=
         elif op == 3:
-            return self_tuple != other_tuple
+            # Do the cheap comparisons first
+            return not (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # >
         elif op == 4:
-            return self_tuple > other_tuple
+            return span_c.start_char > other_span_c.start_char
         # >=
         elif op == 5:
-            return self_tuple >= other_tuple
+            return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
-        return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.c.id))
+        cdef SpanC* span_c = self.span_c()
+        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
 
     def __len__(self):
         """Get the number of tokens in the span.
@@ -167,9 +180,10 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#len
         """
-        if self.c.end < self.c.start:
+        cdef SpanC* span_c = self.span_c()
+        if span_c.end < span_c.start:
             return 0
-        return self.c.end - self.c.start
+        return span_c.end - span_c.start
 
     def __repr__(self):
         return self.text
@@ -183,15 +197,16 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#getitem
         """
+        cdef SpanC* span_c = self.span_c()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
         else:
             if i < 0:
-                token_i = self.c.end + i
+                token_i = span_c.end + i
             else:
-                token_i = self.c.start + i
-            if self.c.start <= token_i < self.c.end:
+                token_i = span_c.start + i
+            if span_c.start <= token_i < span_c.end:
                 return self.doc[token_i]
             else:
                 raise IndexError(Errors.E1002)
@@ -203,7 +218,8 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#iter
         """
-        for i in range(self.c.start, self.c.end):
+        cdef SpanC* span_c = self.span_c()
+        for i in range(span_c.start, span_c.end):
             yield self.doc[i]
 
     def __reduce__(self):
@@ -211,9 +227,10 @@ cdef class Span:
 
     @property
     def _(self):
+        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=self.c.start_char, end=self.c.end_char)
+                          start=span_c.start_char, end=span_c.end_char)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -287,13 +304,14 @@ cdef class Span:
         cdef int length = len(array)
         cdef attr_t value
         cdef int i, head_col, ancestor_i
+        cdef SpanC* span_c = self.span_c()
         old_to_new_root = dict()
         if HEAD in attrs:
             head_col = attrs.index(HEAD)
             for i in range(length):
                 # if the HEAD refers to a token outside this span, find a more appropriate ancestor
                 token = self[i]
-                ancestor_i = token.head.i - self.c.start   # span offset
+                ancestor_i = token.head.i - span_c.start   # span offset
                 if ancestor_i not in range(length):
                     if DEP in attrs:
                         array[i, attrs.index(DEP)] = dep
@@ -301,7 +319,7 @@ cdef class Span:
                     # try finding an ancestor within this span
                     ancestors = token.ancestors
                     for ancestor in ancestors:
-                        ancestor_i = ancestor.i - self.c.start
+                        ancestor_i = ancestor.i - span_c.start
                         if ancestor_i in range(length):
                             array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
 
@@ -330,7 +348,8 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#get_lca_matrix
         """
-        return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end))
+        cdef SpanC* span_c = self.span_c()
+        return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -440,6 +459,9 @@ cdef class Span:
         else:
             raise ValueError(Errors.E030)
 
+    cdef SpanC* span_c(self):
+        return self.c.get()
+
     @property
     def sents(self):
         """Obtain the sentences that contain this span. If the given span
@@ -494,10 +516,13 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#ents
         """
         cdef Span ent
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* ent_span_c
         ents = []
         for ent in self.doc.ents:
-            if ent.c.start >= self.c.start:
-                if ent.c.end <= self.c.end:
+            ent_span_c = ent.span_c()
+            if ent_span_c.start >= span_c.start:
+                if ent_span_c.end <= span_c.end:
                     ents.append(ent)
                 else:
                     break
@@ -631,11 +656,12 @@ cdef class Span:
         # This should probably be called 'head', and the other one called
         # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
+        cdef SpanC* span_c = self.span_c()
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The
         # longer the span, the more likely it contains a sentence root, and
         # in this case we return in linear time.
-        for i in range(self.c.start, self.c.end):
+        for i in range(span_c.start, span_c.end):
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
@@ -646,15 +672,15 @@ cdef class Span:
         # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
-        for i in range(self.c.start, self.c.end):
-            if self.c.start <= (i+self.doc.c[i].head) < self.c.end:
+        for i in range(span_c.start, span_c.end):
+            if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
                 continue
             words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
             if words_to_root < current_best:
                 current_best = words_to_root
                 root = i
         if root == -1:
-            return self.doc[self.c.start]
+            return self.doc[span_c.start]
         else:
             return self.doc[root]
 
@@ -677,9 +703,10 @@ cdef class Span:
         span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
         """
-        start_idx += self.c.start_char
-        end_idx += self.c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
+        cdef SpanC* span_c = self.span_c()
+        start_idx += span_c.start_char
+        end_idx += span_c.start_char
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
 
     @property
     def conjuncts(self):
@@ -759,53 +786,53 @@ cdef class Span:
 
     property start:
         def __get__(self):
-            return self.c.start
+            return self.span_c().start
 
         def __set__(self, int start):
             if start < 0:
-                raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-            self.c.start = start
+                raise IndexError("TODO")
+            self.span_c().start = start
 
     property end:
         def __get__(self):
-            return self.c.end
+            return self.span_c().end
 
         def __set__(self, int end):
             if end < 0:
-                raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-            self.c.end = end
+                raise IndexError("TODO")
+            self.span_c().end = end
 
     property start_char:
         def __get__(self):
-            return self.c.start_char
+            return self.span_c().start_char
 
         def __set__(self, int start_char):
             if start_char < 0:
-                raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-            self.c.start_char = start_char
+                raise IndexError("TODO")
+            self.span_c().start_char = start_char
 
     property end_char:
         def __get__(self):
-            return self.c.end_char
+            return self.span_c().end_char
 
         def __set__(self, int end_char):
             if end_char < 0:
-                raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-            self.c.end_char = end_char
+                raise IndexError("TODO")
+            self.span_c().end_char = end_char
 
     property label:
         def __get__(self):
-            return self.c.label
+            return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.c.label = label
+            self.span_c().label = label
 
     property kb_id:
         def __get__(self):
-            return self.c.kb_id
+            return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.c.kb_id = kb_id
+            self.span_c().kb_id = kb_id
 
     property id:
         def __get__(self):
diff --git a/spacy/tokens/span_group.pxd b/spacy/tokens/span_group.pxd
index 7f4145682eb..6f0ffd0eb36 100644
--- a/spacy/tokens/span_group.pxd
+++ b/spacy/tokens/span_group.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
 from ..structs cimport SpanC
@@ -7,6 +8,6 @@ cdef class SpanGroup:
     cdef public object _doc_ref
     cdef public str name
     cdef public dict attrs
-    cdef vector[SpanC] c
+    cdef vector[shared_ptr[SpanC]] c
 
-    cdef void push_back(self, SpanC span) nogil
+    cdef void push_back(self, const shared_ptr[SpanC] &span)
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 257c907bcce..8a524926a03 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -9,6 +9,8 @@ import srsly
 from spacy.errors import Errors
 
 from .span cimport Span
+from libc.stdint cimport uint64_t, uint32_t, int32_t
+from libcpp.memory cimport make_shared
 
 
 cdef class SpanGroup:
@@ -202,10 +204,12 @@ cdef class SpanGroup:
 
         DOCS: https://spacy.io/api/spangroup#to_bytes
         """
+        cdef SpanC* span_c
         output = {"name": self.name, "attrs": self.attrs, "spans": []}
         cdef int i
         for i in range(self.c.size()):
             span = self.c[i]
+            span_c = span.get()
             # The struct.pack here is probably overkill, but it might help if
             # you're saving tonnes of spans, and it doesn't really add any
             # complexity. We do take care to specify little-endian byte order
@@ -217,13 +221,13 @@ cdef class SpanGroup:
             # l: int32_t
             output["spans"].append(struct.pack(
                 ">QQQllll",
-                span.id,
-                span.kb_id,
-                span.label,
-                span.start,
-                span.end,
-                span.start_char,
-                span.end_char
+                span_c.id,
+                span_c.kb_id,
+                span_c.label,
+                span_c.start,
+                span_c.end,
+                span_c.start_char,
+                span_c.end_char
             ))
         return srsly.msgpack_dumps(output)
 
@@ -250,10 +254,10 @@ cdef class SpanGroup:
             span.end = items[4]
             span.start_char = items[5]
             span.end_char = items[6]
-            self.c.push_back(span)
+            self.c.push_back(make_shared[SpanC](span))
         return self
 
-    cdef void push_back(self, SpanC span) nogil:
+    cdef void push_back(self, const shared_ptr[SpanC] &span):
         self.c.push_back(span)
 
     def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:

From 4d56030fc232a614fcc593b8e4e1f1c3c491e6cc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 15 Apr 2022 15:34:58 +0200
Subject: [PATCH 115/504] Return doc offsets in Matcher on spans (#10576)

The returned match offsets were only adjusted for `as_spans`, not
generally. Because the `on_match` callbacks are always applied to the
doc, the `Matcher` matches on spans should consistently use the doc
offsets.
---
 spacy/matcher/matcher.pyx               |  7 ++++---
 spacy/tests/matcher/test_matcher_api.py | 13 ++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 9a9ed421223..f0116169a6b 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -275,6 +275,10 @@ cdef class Matcher:
         # non-overlapping ones this `match` can be either (start, end) or
         # (start, end, alignments) depending on `with_alignments=` option.
         for key, *match in matches:
+            # Adjust span matches to doc offsets
+            if isinstance(doclike, Span):
+                match[0] += doclike.start
+                match[1] += doclike.start
             span_filter = self._filter.get(key)
             if span_filter is not None:
                 pairs = pairs_by_id.get(key, [])
@@ -305,9 +309,6 @@ cdef class Matcher:
         if as_spans:
             final_results = []
             for key, start, end, *_ in final_matches:
-                if isinstance(doclike, Span):
-                    start += doclike.start
-                    end += doclike.start
                 final_results.append(Span(doc, start, end, label=key))
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index c824ca39253..106a00b3011 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -794,9 +794,16 @@ def test_matcher_span(matcher):
     doc = Doc(matcher.vocab, words=text.split())
     span_js = doc[:3]
     span_java = doc[4:]
-    assert len(matcher(doc)) == 2
-    assert len(matcher(span_js)) == 1
-    assert len(matcher(span_java)) == 1
+    doc_matches = matcher(doc)
+    span_js_matches = matcher(span_js)
+    span_java_matches = matcher(span_java)
+    assert len(doc_matches) == 2
+    assert len(span_js_matches) == 1
+    assert len(span_java_matches) == 1
+
+    # match offsets always refer to the doc
+    assert doc_matches[0] == span_js_matches[0]
+    assert doc_matches[1] == span_java_matches[0]
 
 
 def test_matcher_as_spans(matcher):

From b4e7d910841d64a7c84cdf2e94c9a547c3ec2633 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 15 Jul 2022 11:14:08 +0200
Subject: [PATCH 116/504] `Morphology`/`Morphologizer` optimizations and
 refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologzier`: Avoid unnecessary sorting of morpho features

* `Morphologizer`: Remove execessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
---
 spacy/morphology.pxd               |  47 +++--
 spacy/morphology.pyx               | 274 +++++++++++++++++------------
 spacy/pipeline/morphologizer.pyx   |  30 ++--
 spacy/structs.pxd                  |   8 -
 spacy/tokens/morphanalysis.pxd     |   9 +-
 spacy/tokens/morphanalysis.pyx     |  40 +++--
 spacy/tokens/token.pyx             |   3 +-
 website/docs/api/morphologizer.mdx |   2 +-
 8 files changed, 240 insertions(+), 173 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index ee43aa4ec81..494088879b1 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,27 +1,42 @@
 cimport numpy as np
-from cymem.cymem cimport Pool
-from libc.stdint cimport uint64_t
-from preshed.maps cimport PreshMap
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.unordered_map cimport unordered_map
+from libcpp.vector cimport vector
+from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
 from .typedefs cimport attr_t, hash_t
 
 
+cdef cppclass Feature:
+    hash_t field
+    hash_t value
+
+    __init__():
+        this.field = 0
+        this.value = 0
+
+
+cdef cppclass MorphAnalysisC:
+    hash_t key  
+    vector[Feature] features
+
+    __init__():
+        this.key = 0
+
 cdef class Morphology:
-    cdef readonly Pool mem
     cdef readonly StringStore strings
-    cdef PreshMap tags  # Keyed by hash, value is pointer to tag
-
-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
-    cdef int insert(self, MorphAnalysisC tag) except -1
+    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
 
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
+    cdef hash_t _add(self, features)
+    cdef str _normalize_features(self, features)
+    cdef str get_morph_str(self, hash_t morph_key)
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
 
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
-cdef list list_features(const MorphAnalysisC* morph)
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
-cdef int get_n_by_field(
-    attr_t* results,
-    const MorphAnalysisC* morph,
-    attr_t field,
-) nogil
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index cef45b04d14..7ee621056f1 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,11 +1,11 @@
 # cython: infer_types
 # cython: profile=False
 import warnings
+from typing import Union, Tuple, List, Dict, Optional
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr
 
-import numpy
-
-from .attrs cimport POS
-
+from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 from .parts_of_speech import IDS as POS_IDS
@@ -26,135 +26,187 @@ cdef class Morphology:
     EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
-        self.mem = Pool()
         self.strings = strings
-        self.tags = PreshMap()
 
     def __reduce__(self):
         tags = set([self.get(self.strings[s]) for s in self.strings])
         tags -= set([""])
         return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
 
-    def add(self, features):
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
+        match = self.tags.find(tag_hash)
+        if match != self.tags.const_end():
+            return deref(match).second
+        else:
+            return shared_ptr[MorphAnalysisC]()
+
+    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
+        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
+            attr_key = self.strings.as_string(attr_key)
+            attr_value = self.strings.as_string(attr_value)
+
+            # Preserve multiple values as a list
+            if self.VALUE_SEP in attr_value:
+                values = attr_value.split(self.VALUE_SEP)
+                values.sort()
+                attr_value = values
+        else:
+            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
+            return None
+
+        return attr_key, attr_value
+
+    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
+        if not feats or feats == self.EMPTY_MORPH:
+            return {}
+
+        out = []
+        for feat in feats.split(self.FEATURE_SEP):
+            field, values = feat.split(self.FIELD_SEP, 1)
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
+        out = []
+        for field, values in feats.items():
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+
+    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
+        norm_feats_string = self.FEATURE_SEP.join([
+                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            for field, values in feats.items()
+        ])
+        return norm_feats_string or self.EMPTY_MORPH
+
+
+    cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
         FEATS format as a string or in the tag map dict format.
         Returns the hash of the new analysis.
         """
-        cdef MorphAnalysisC* tag_ptr
+        cdef hash_t tag_hash = 0
+        cdef shared_ptr[MorphAnalysisC] tag
         if isinstance(features, str):
             if features == "":
                 features = self.EMPTY_MORPH
-            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
-            if tag_ptr != NULL:
-                return tag_ptr.key
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+
+            tag_hash = self.strings[features]
+            tag = self._lookup_tag(tag_hash)
+            if tag:
+                return deref(tag).key
+
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # intified ("Field", "Field=Value") pairs
-        field_feature_pairs = []
-        for field in sorted(string_features):
-            values = string_features[field]
-            for value in values.split(self.VALUE_SEP):
-                field_feature_pairs.append((
-                    self.strings.add(field),
-                    self.strings.add(field + self.FIELD_SEP + value),
-                ))
-        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
+
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
-        norm_feats_string = self.normalize_features(features)
-        tag.key = self.strings.add(norm_feats_string)
-        self.insert(tag)
-        return tag.key
+        norm_feats_string = self._normalized_feat_dict_to_str(features)
+        tag_hash = self.strings.add(norm_feats_string)
+        tag = self._lookup_tag(tag_hash)
+        if tag:
+            return deref(tag).key
+
+        self._intern_morph_tag(tag_hash, features)
+        return tag_hash
+
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
+        # intified ("Field", "Field=Value") pairs where fields with multiple values have
+        # been split into individual tuples, e.g.:
+        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
+        # ("Field2", "Field2=Value3")]
+        field_feature_pairs = []
 
-    def normalize_features(self, features):
+        # Feat dict is normalized at this point.
+        for field, values in feats.items():
+            field_key = self.strings.add(field)
+            if isinstance(values, list):
+                for value in values:
+                    value_key = self.strings.add(field + self.FIELD_SEP + value)
+                    field_feature_pairs.append((field_key, value_key))
+            else:
+                # We could box scalar values into a list and use a common
+                # code path to generate features but that incurs a small 
+                # but measurable allocation/iteration overhead (as this
+                # branch is taken often enough).
+                value_key = self.strings.add(field + self.FIELD_SEP + values)
+                field_feature_pairs.append((field_key, value_key))
+
+        num_features = len(field_feature_pairs)
+        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
+        deref(tag).key = tag_key
+        deref(tag).features.resize(num_features)
+
+        for i in range(num_features):
+            deref(tag).features[i].field = field_feature_pairs[i][0]
+            deref(tag).features[i].value = field_feature_pairs[i][1]
+
+        self.tags[tag_key] = tag
+
+    cdef str get_morph_str(self, hash_t morph_key):
+        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
+        if not tag:
+            return ""
+        else:
+            return self.strings[deref(tag).key]
+
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
+        return self._lookup_tag(morph_key)
+
+    cdef str _normalize_features(self, features):
         """Create a normalized FEATS string from a features string or dict.
 
         features (Union[dict, str]): Features as dict or UFEATS string.
         RETURNS (str): Features as normalized UFEATS string.
         """
         if isinstance(features, str):
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = self.normalize_attrs(features)
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(
-            sorted(
-                [self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
-            )
-        )
-        return norm_feats_string or self.EMPTY_MORPH
 
-    def normalize_attrs(self, attrs):
-        """Convert attrs dict so that POS is always by ID, other features are
-        by string. Values separated by VALUE_SEP are sorted.
-        """
-        out = {}
-        attrs = dict(attrs)
-        for key, value in attrs.items():
-            # convert POS value to ID
-            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
-                if isinstance(value, str) and value.upper() in POS_IDS:
-                    value = POS_IDS[value.upper()]
-                elif isinstance(value, int) and value not in POS_IDS.values():
-                    warnings.warn(Warnings.W100.format(feature={key: value}))
-                    continue
-                out[POS] = value
-            # accept any string or ID fields and values and convert to strings
-            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
-                key = self.strings.as_string(key)
-                value = self.strings.as_string(value)
-                # sort values
-                if self.VALUE_SEP in value:
-                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
-                out[key] = value
-            else:
-                warnings.warn(Warnings.W100.format(feature={key: value}))
-        return out
+        return self._normalized_feat_dict_to_str(features)
 
-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
-        """Creates a MorphAnalysisC from a list of intified
-        ("Field", "Field=Value") tuples where fields with multiple values have
-        been split into individual tuples, e.g.:
-        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
-        ("Field2", "Field2=Value3")]
-        """
-        cdef MorphAnalysisC tag
-        tag.length = len(field_feature_pairs)
-        if tag.length > 0:
-            tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-            tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-        for i, (field, feature) in enumerate(field_feature_pairs):
-            tag.fields[i] = field
-            tag.features[i] = feature
-        return tag
-
-    cdef int insert(self, MorphAnalysisC tag) except -1:
-        cdef hash_t key = tag.key
-        if self.tags.get(key) == NULL:
-            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            tag_ptr[0] = tag
-            self.tags.set(key, <void*>tag_ptr)
-
-    def get(self, hash_t morph):
-        tag = <MorphAnalysisC*>self.tags.get(morph)
-        if tag == NULL:
-            return ""
-        else:
-            return self.strings[tag.key]
+    def add(self, features):
+        return self._add(features)
+
+    def get(self, morph_key):
+        return self.get_morph_str(morph_key)
+
+    def normalize_features(self, features):
+        return self._normalize_features(features)
 
     @staticmethod
-    def feats_to_dict(feats):
+    def feats_to_dict(feats, *, sort_values=True):
         if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
-        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
-                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
+
+        out = {}
+        for feat in feats.split(Morphology.FEATURE_SEP):
+            field, values = feat.split(Morphology.FIELD_SEP, 1)
+            if sort_values:
+                values = values.split(Morphology.VALUE_SEP)
+                values.sort()
+                values = Morphology.VALUE_SEP.join(values)
+
+            out[field] = values
+        return out
 
     @staticmethod
     def dict_to_feats(feats_dict):
@@ -163,34 +215,34 @@ cdef class Morphology:
         return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
 
 
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
     cdef int i
-    for i in range(morph.length):
-        if morph.features[i] == feature:
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].value == feature:
             return True
     return False
 
 
-cdef list list_features(const MorphAnalysisC* morph):
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
     cdef int i
     features = []
-    for i in range(morph.length):
-        features.append(morph.features[i])
+    for i in range(deref(morph).features.size()):
+        features.append(deref(morph).features[i].value)
     return features
 
 
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
-    cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
+    cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
     n = get_n_by_field(<uint64_t*>results.data, morph, field)
     return results[:n]
 
 
-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
     cdef int n_results = 0
     cdef int i
-    for i in range(morph.length):
-        if morph.fields[i] == field:
-            results[n_results] = morph.features[i]
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].field == field:
+            results[n_results] = deref(morph).features[i].value
             n_results += 1
     return n_results
 
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d415ae43c5c..bdbe75fd824 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -132,8 +132,8 @@ class Morphologizer(Tagger):
 
     @property
     def labels(self):
-        """RETURNS (Tuple[str]): The labels currently added to the component."""
-        return tuple(self.cfg["labels_morph"].keys())
+        """RETURNS (Iterable[str]): The labels currently added to the component."""
+        return self.cfg["labels_morph"].keys()
 
     @property
     def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
@@ -156,7 +156,7 @@ class Morphologizer(Tagger):
         # normalize label
         norm_label = self.vocab.morphology.normalize_features(label)
         # extract separate POS and morph tags
-        label_dict = Morphology.feats_to_dict(label)
+        label_dict = Morphology.feats_to_dict(label, sort_values=False)
         pos = label_dict.get(self.POS_FEAT, "")
         if self.POS_FEAT in label_dict:
             label_dict.pop(self.POS_FEAT)
@@ -194,7 +194,7 @@ class Morphologizer(Tagger):
                         continue
                     morph = str(token.morph)
                     # create and add the combined morph+POS label
-                    morph_dict = Morphology.feats_to_dict(morph)
+                    morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                     if pos:
                         morph_dict[self.POS_FEAT] = pos
                     norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@@ -211,7 +211,7 @@ class Morphologizer(Tagger):
             for i, token in enumerate(example.reference):
                 pos = token.pos_
                 morph = str(token.morph)
-                morph_dict = Morphology.feats_to_dict(morph)
+                morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@@ -235,26 +235,29 @@ class Morphologizer(Tagger):
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         cdef bint extend = self.cfg["extend"]
-        labels = self.labels
+
+        # We require random access for the upcoming ops, so we need
+        # to allocate a compatible container out of the iterable.
+        labels = tuple(self.labels)
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
-                morph = labels[tag_id]
+                morph = labels[int(tag_id)]
                 # set morph
                 if doc.c[j].morph == 0 or overwrite or extend:
                     if overwrite and extend:
                         # morphologizer morph overwrites any existing features
                         # while extending
-                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
-                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
+                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
                         doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                     elif extend:
                         # existing features are preserved and any new features
                         # are added
-                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
-                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
+                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
                         doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                     else:
                         # clobber
@@ -274,8 +277,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
-                                                    label_smoothing=self.cfg["label_smoothing"])
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
@@ -296,7 +298,7 @@ class Morphologizer(Tagger):
                     label = None
                 # Otherwise, generate the combined label
                 else:
-                    label_dict = Morphology.feats_to_dict(morph)
+                    label_dict = Morphology.feats_to_dict(morph, sort_values=False)
                     if pos:
                         label_dict[self.POS_FEAT] = pos
                     label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 8cfcc2964f6..e7513cc11b7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -57,14 +57,6 @@ cdef struct TokenC:
     hash_t ent_id
 
 
-cdef struct MorphAnalysisC:
-    hash_t key
-    int length
-
-    attr_t* fields
-    attr_t* features
-
-
 # Internal struct, for storage and disambiguation of entities.
 cdef struct KBEntryC:
 
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 728f0aaf75a..f866488ecc2 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,9 +1,12 @@
-from ..structs cimport MorphAnalysisC
-from ..typedefs cimport hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t
+from ..morphology cimport MorphAnalysisC
+from libcpp.memory cimport shared_ptr
 
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
     cdef readonly hash_t key
-    cdef MorphAnalysisC c
+    cdef shared_ptr[MorphAnalysisC] c
+
+    cdef void _init_c(self, hash_t key)
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index ea5d07fa449..ceaa3ecd04e 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -8,6 +8,13 @@ from ..morphology import Morphology
 from ..morphology cimport check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t, attr_t
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
 
 
 cdef class MorphAnalysis:
@@ -15,39 +22,38 @@ cdef class MorphAnalysis:
     def __init__(self, Vocab vocab, features=dict()):
         self.vocab = vocab
         self.key = self.vocab.morphology.add(features)
-        analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key)
-        if analysis is not NULL:
-            self.c = analysis[0]
+        self._init_c(self.key)
+
+    cdef void _init_c(self, hash_t key):
+        cdef shared_ptr[MorphAnalysisC] analysis = self.vocab.morphology.get_morph_c(key)
+        if analysis:
+            self.c = analysis
         else:
-            memset(&self.c, 0, sizeof(self.c))
+            self.c = EMPTY_MORPH_TAG
 
     @classmethod
     def from_id(cls, Vocab vocab, hash_t key):
         """Create a morphological analysis from a given ID."""
-        cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
+        cdef MorphAnalysis morph = MorphAnalysis(vocab)
         morph.vocab = vocab
         morph.key = key
-        analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key)
-        if analysis is not NULL:
-            morph.c = analysis[0]
-        else:
-            memset(&morph.c, 0, sizeof(morph.c))
+        morph._init_c(key)
         return morph
 
     def __contains__(self, feature):
         """Test whether the morphological analysis contains some feature."""
         cdef attr_t feat_id = self.vocab.strings.as_int(feature)
-        return check_feature(&self.c, feat_id)
+        return check_feature(self.c, feat_id)
 
     def __iter__(self):
         """Iterate over the features in the analysis."""
         cdef attr_t feature
-        for feature in list_features(&self.c):
+        for feature in list_features(self.c):
             yield self.vocab.strings[feature]
 
     def __len__(self):
         """The number of features in the analysis."""
-        return self.c.length
+        return deref(self.c).features.size()
 
     def __hash__(self):
         return self.key
@@ -63,11 +69,7 @@ cdef class MorphAnalysis:
     def get(self, field, default=None):
         """Retrieve feature values by field."""
         cdef attr_t field_id = self.vocab.strings.as_int(field)
-        cdef np.ndarray results = get_by_field(&self.c, field_id)
-        if len(results) == 0:
-            if default is None:
-                default = []
-            return default
+        cdef np.ndarray results = get_by_field(self.c, field_id)
         features = [self.vocab.strings[result] for result in results]
         return [f.split(Morphology.FIELD_SEP)[1] for f in features]
 
@@ -75,7 +77,7 @@ cdef class MorphAnalysis:
         """Produce a json serializable representation as a UD FEATS-style
         string.
         """
-        morph_string = self.vocab.strings[self.c.key]
+        morph_string = self.vocab.strings[deref(self.c).key]
         if morph_string == self.vocab.morphology.EMPTY_MORPH:
             return ""
         return morph_string
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 2ed736b7035..c0cd0af42c0 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -39,6 +39,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref
 
 
 cdef class Token:
@@ -257,7 +258,7 @@ cdef class Token:
             # Check that the morph has the same vocab
             if self.vocab != morph.vocab:
                 raise ValueError(Errors.E1013)
-            self.c.morph = morph.c.key
+            self.c.morph = deref(morph.c).key
 
     def set_morph(self, features):
         cdef hash_t key
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 8f189d129c3..ce16f534219 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -402,7 +402,7 @@ coarse-grained POS as the feature `POS`.
 
 | Name        | Description                                            |
 | ----------- | ------------------------------------------------------ |
-| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
+| **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}
 

From 7af2a3bd4b8d305f5f612b91c3a328e566e4fa94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 29 Jul 2022 15:12:19 +0200
Subject: [PATCH 117/504] precompute_hiddens/Parser: look up CPU ops once (v4)
 (#11068)

* precompute_hiddens/Parser: look up CPU ops once

* precompute_hiddens: make cpu_ops private
---
 spacy/ml/parser_model.pyx            | 8 +++-----
 spacy/pipeline/transition_parser.pxd | 1 +
 spacy/pipeline/transition_parser.pyx | 8 ++------
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index f004c562e7d..cb323e98891 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -386,6 +386,7 @@ cdef class precompute_hiddens:
     cdef bint _is_synchronized
     cdef public object ops
     cdef public object numpy_ops
+    cdef public object _cpu_ops
     cdef np.ndarray _features
     cdef np.ndarray _cached
     cdef np.ndarray bias
@@ -416,6 +417,7 @@ cdef class precompute_hiddens:
         self.nO = cached.shape[2]
         self.ops = lower_model.ops
         self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
         assert activation in (None, "relu", "maxout")
         self.activation = activation
         self._is_synchronized = False
@@ -478,11 +480,7 @@ cdef class precompute_hiddens:
         # - Output from backward on GPU
         bp_hiddens = self._bp_hiddens
 
-        cdef CBlas cblas
-        if isinstance(self.ops, CupyOps):
-            cblas = NUMPY_OPS.cblas()
-        else:
-            cblas = self.ops.cblas()
+        cdef CBlas cblas = self._cpu_ops.cblas()
 
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index 7ddb91e0184..7ef20563b12 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -12,6 +12,7 @@ cdef class Parser(TrainablePipe):
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
     cdef public object _multitasks
+    cdef object _cpu_ops
 
     cdef void _parseC(
         self,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9a278fc1328..b8ebbf8ca88 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -135,6 +135,7 @@ cdef class Parser(TrainablePipe):
 
         self._rehearsal_model = None
         self.scorer = scorer
+        self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -273,12 +274,7 @@ cdef class Parser(TrainablePipe):
     def greedy_parse(self, docs, drop=0.):
         cdef vector[StateC*] states
         cdef StateClass state
-        ops = self.model.ops
-        cdef CBlas cblas
-        if isinstance(ops, CupyOps):
-            cblas = NUMPY_OPS.cblas()
-        else:
-            cblas = ops.cblas()
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
         set_dropout_rate(self.model, drop)
         batch = self.moves.init_batch(docs)

From 1816a9ddaa497f5200ebc104c12ba76762226e51 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 10 Aug 2022 11:44:05 +0200
Subject: [PATCH 118/504] Rename modules for consistency (#11286)

* rename Python module to entity_ruler

* rename Python module to attribute_ruler
---
 spacy/pipeline/__init__.py                               | 6 +++---
 spacy/pipeline/{attributeruler.py => attribute_ruler.py} | 0
 spacy/pipeline/{entityruler.py => entity_ruler.py}       | 0
 website/docs/api/attributeruler.mdx                      | 6 +++---
 website/docs/api/entityruler.mdx                         | 6 +++---
 website/docs/usage/saving-loading.mdx                    | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)
 rename spacy/pipeline/{attributeruler.py => attribute_ruler.py} (100%)
 rename spacy/pipeline/{entityruler.py => entity_ruler.py} (100%)

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2c4a5a8a87f..82d24486a27 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,9 +1,9 @@
-from .attributeruler import AttributeRuler
+from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
-from .entityruler import EntityRuler
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .ner import EntityRecognizer
+from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attribute_ruler.py
similarity index 100%
rename from spacy/pipeline/attributeruler.py
rename to spacy/pipeline/attribute_ruler.py
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entity_ruler.py
similarity index 100%
rename from spacy/pipeline/entityruler.py
rename to spacy/pipeline/entity_ruler.py
diff --git a/website/docs/api/attributeruler.mdx b/website/docs/api/attributeruler.mdx
index c1831918752..e8cb248f85b 100644
--- a/website/docs/api/attributeruler.mdx
+++ b/website/docs/api/attributeruler.mdx
@@ -1,8 +1,8 @@
 ---
 title: AttributeRuler
 tag: class
-source: spacy/pipeline/attributeruler.py
-version: 3
+source: spacy/pipeline/attribute_ruler.py
+new: 3
 teaser: 'Pipeline component for rule-based token attribute assignment'
 api_string_name: attribute_ruler
 api_trainable: false
@@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the
 | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
+%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py
 ```
 
 ## AttributeRuler.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 27624398ec6..a35b6e2566c 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,8 +1,8 @@
 ---
 title: EntityRuler
 tag: class
-source: spacy/pipeline/entityruler.py
-version: 2.1
+source: spacy/pipeline/entity_ruler.py
+new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
@@ -65,7 +65,7 @@ how the component should be configured. You can override its settings via the
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/entityruler.py
+%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
 ```
 
 ## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 9a6791d5e0a..b44bd86ed06 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -189,7 +189,7 @@ the data to and from a JSON file.
 >
 > To see custom serialization methods in action, check out the new
 > [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the
+> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with a rule-based entity recognizer and including all

From 84b8bdf7cd6fa76b06b127cdbca0c024df9a5711 Mon Sep 17 00:00:00 2001
From: antonpibm <51074867+antonpibm@users.noreply.github.com>
Date: Thu, 11 Aug 2022 12:26:26 +0300
Subject: [PATCH 119/504] Match private networks as URLs (#11121)

---
 spacy/lang/tokenizer_exceptions.py | 4 ----
 spacy/tests/tokenizer/test_urls.py | 5 ++++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index dbf9aab4912..a612ae8ac7e 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -16,10 +16,6 @@
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
-    # private & local networks
-    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
-    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
-    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
     # IP address dotted notation octets
     # excludes loopback network 0.0.0.0
     # excludes reserved space >= 224.0.0.0
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index ff8812be183..4753462a506 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -32,6 +32,9 @@
     "http://userid:password@example.com/",
     "http://142.42.1.1/",
     "http://142.42.1.1:8080/",
+    "http://10.140.12.13/foo",
+    "http://10.140.12.13/foo/bar?arg1=baz&arg2=taz",
+    "http://10.1.1.1",
     "http://foo.com/blah_(wikipedia)#cite-1",
     "http://foo.com/blah_(wikipedia)_blah#cite-1",
     "http://foo.com/unicode_(✪)_in_parens",
@@ -93,6 +96,7 @@
     "http://foo.bar/foo(bar)baz quux",
     "http://-error-.invalid/",
     "http://a.b-.co",
+    # Loopback and broadcast addresses should be excluded
     "http://0.0.0.0",
     "http://10.1.1.0",
     "http://10.1.1.255",
@@ -101,7 +105,6 @@
     "http://3628126748",
     "http://.www.foo.bar/",
     "http://.www.foo.bar./",
-    "http://10.1.1.1",
     "NASDAQ:GOOG",
     "http://-a.b.co",
     pytest.param("foo.com", marks=pytest.mark.xfail()),

From 360a674e4996f8534920c23b0b464e5de420d7dd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Aug 2022 12:13:54 +0200
Subject: [PATCH 120/504] Remove intify_attrs(_do_deprecated) (#11319)

---
 spacy/attrs.pyx                | 71 +---------------------------------
 spacy/tests/lang/test_attrs.py |  8 ----
 spacy/tokenizer.pyx            |  4 +-
 spacy/vocab.pyx                |  3 +-
 4 files changed, 4 insertions(+), 82 deletions(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 363dd094dcd..0a4aecc5d85 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -98,7 +98,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 locals().update(IDS)
 
 
-def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
+def intify_attrs(stringy_attrs, strings_map=None):
     """
     Normalize a dictionary of attributes, converting them to ints.
 
@@ -110,75 +110,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         converted to ints.
     """
     inty_attrs = {}
-    if _do_deprecated:
-        if "F" in stringy_attrs:
-            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if "L" in stringy_attrs:
-            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if "pos" in stringy_attrs:
-            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")  # no-cython-lint
-        if "number" in stringy_attrs:
-            stringy_attrs.pop("number")
-        if "tenspect" in stringy_attrs:
-            stringy_attrs.pop("tenspect")
-        morph_keys = [
-            "PunctType",
-            "PunctSide",
-            "Other",
-            "Degree",
-            "AdvType",
-            "Number",
-            "VerbForm",
-            "PronType",
-            "Aspect",
-            "Tense",
-            "PartType",
-            "Poss",
-            "Hyph",
-            "ConjType",
-            "NumType",
-            "Foreign",
-            "VerbType",
-            "NounType",
-            "Gender",
-            "Mood",
-            "Negative",
-            "Tense",
-            "Voice",
-            "Abbr",
-            "Derivation",
-            "Echo",
-            "Foreign",
-            "NameType",
-            "NounType",
-            "NumForm",
-            "NumValue",
-            "PartType",
-            "Polite",
-            "StyleVariant",
-            "PronType",
-            "AdjType",
-            "Person",
-            "Variant",
-            "AdpType",
-            "Reflex",
-            "Negative",
-            "Mood",
-            "Aspect",
-            "Case",
-            "Polarity",
-            "PrepCase",
-            "Animacy",  # U20
-        ]
-        for key in morph_keys:
-            if key in stringy_attrs:
-                stringy_attrs.pop(key)
-            elif key.lower() in stringy_attrs:
-                stringy_attrs.pop(key.lower())
-            elif key.upper() in stringy_attrs:
-                stringy_attrs.pop(key.upper())
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index fd96e8f9bd4..0f52c3ed511 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -31,14 +31,6 @@ def test_attrs_idempotence(text):
     assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
 
 
-@pytest.mark.parametrize("text", ["dog"])
-def test_attrs_do_deprecated(text):
-    int_attrs = intify_attrs(
-        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
-    )
-    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
-
-
 def test_attrs_ent_iob_intify():
     int_attrs = intify_attrs({"ENT_IOB": ""})
     assert int_attrs == {ENT_IOB: 0}
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 6f2b10734c5..c95392a2026 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -579,7 +579,7 @@ cdef class Tokenizer:
         substrings (iterable): A sequence of dicts, where each dict describes
             a token and its attributes.
         """
-        attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
+        attrs = [intify_attrs(spec) for spec in substrings]
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
@@ -647,7 +647,7 @@ cdef class Tokenizer:
             url_match = re.compile("a^").match
         special_cases = {}
         for orth, special_tokens in self.rules.items():
-            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
+            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens]
         tokens = []
         for substring in text.split():
             suffixes = []
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4004a70e034..c03226e2467 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -273,8 +273,7 @@ cdef class Vocab:
         cdef int i
         tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
         for i, props in enumerate(substrings):
-            props = intify_attrs(props, strings_map=self.strings,
-                                 _do_deprecated=True)
+            props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
             lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])

From 5abfa82155d516d5688779e2b40d255d9ec437b4 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 22 Aug 2022 15:52:24 +0200
Subject: [PATCH 121/504] Cleanup Cython structs (#11337)

* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumpsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
---
 spacy/tokenizer.pxd |  6 +-----
 spacy/tokenizer.pyx | 11 +++++++++--
 spacy/vocab.pxd     |  1 -
 spacy/vocab.pyi     |  1 -
 spacy/vocab.pyx     |  7 ++-----
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index a902ebad941..f64e0e93413 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,11 +23,7 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO convert to bool in v4
-    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef bint _faster_heuristics
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index c95392a2026..9b79207f82e 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -8,11 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
+
+from .tokens.doc cimport Doc
+from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
+from .attrs import intify_attrs
+from .symbols import ORTH, NORM
+from .errors import Errors
 from . import util
+from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
@@ -124,10 +131,10 @@ cdef class Tokenizer:
 
     property faster_heuristics:
         def __get__(self):
-            return bool(self._faster_heuristics)
+            return self._faster_heuristics
 
         def __set__(self, faster_heuristics):
-            self._faster_heuristics = bool(faster_heuristics)
+            self._faster_heuristics = faster_heuristics
             self._reload_special_cases()
 
     def __reduce__(self):
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 43e47af1dee..b91ce3ab45b 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -32,7 +32,6 @@ cdef class Vocab:
     cdef public object writing_system
     cdef public object get_noun_chunks
     cdef readonly int length
-    cdef public object _unused_object  # TODO remove in v4, see #9150
     cdef public object lex_attr_getters
     cdef public object cfg
 
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index b7ff20348a0..7f5f23e7847 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -73,7 +73,6 @@ def unpickle_vocab(
     sstore: StringStore,
     vectors: Any,
     morphology: Any,
-    _unused_object: Any,
     lex_attr_getters: Any,
     lookups: Any,
     get_noun_chunks: Any,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index c03226e2467..834f21c35dc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -579,21 +579,18 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
     morph = vocab.morphology
-    _unused_object = vocab._unused_object
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
     get_noun_chunks = vocab.get_noun_chunks
     return (unpickle_vocab,
-            (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
+            (sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))
 
 
-def unpickle_vocab(sstore, vectors, morphology, _unused_object,
-                   lex_attr_getters, lookups, get_noun_chunks):
+def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
     vocab.morphology = morphology
-    vocab._unused_object = _unused_object
     vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
     vocab.get_noun_chunks = get_noun_chunks

From a9a65101293ddf97e5b08ac8f599e2131b2494d9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 22 Aug 2022 20:28:57 +0200
Subject: [PATCH 122/504] Make Span/Doc.ents more consistent for ent_kb_id and
 ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
---
 spacy/tests/doc/test_add_entities.py       |  27 ++++
 spacy/tests/doc/test_span.py               |  56 +++-----
 spacy/tokens/doc.pyx                       |  12 +-
 spacy/tokens/span.pyi                      |  24 ++--
 spacy/tokens/span.pyx                      |  35 ++---
 website/docs/api/span.mdx                  |  46 +++----
 website/docs/api/token.mdx                 | 144 ++++++++++-----------
 website/docs/usage/rule-based-matching.mdx |   6 +-
 8 files changed, 186 insertions(+), 164 deletions(-)

diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 259b21fb3dd..586b8a745f6 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -46,6 +46,33 @@ def test_ents_reset(en_vocab):
     assert [t.ent_iob_ for t in doc] == orig_iobs
 
 
+def test_ents_clear(en_vocab):
+    """Ensure that removing entities clears token attributes"""
+    text = ["Louisiana", "Office", "of", "Conservation"]
+    doc = Doc(en_vocab, words=text)
+    entity = Span(doc, 0, 4, label=391, span_id="TEST")
+    doc.ents = [entity]
+    doc.ents = []
+    for token in doc:
+        assert token.ent_iob == 2
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.ents = [entity]
+    doc.set_ents([], default="missing")
+    for token in doc:
+        assert token.ent_iob == 0
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.set_ents([], default="blocked")
+    for token in doc:
+        assert token.ent_iob == 3
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+
+
 def test_add_overlapping_entities(en_vocab):
     text = ["Louisiana", "Office", "of", "Conservation"]
     doc = Doc(en_vocab, words=text)
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index e5c71dafcf7..ab8538b17dc 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -703,41 +703,21 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2
 
 
-def test_for_partial_ent_sents():
-    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
-    which this tests for.
-    """
-    doc = Doc(
-        English().vocab,
-        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
-        sent_starts=[1, 0, 0, 1, 0, 0],
-    )
-    doc.set_ents([Span(doc, 1, 4, "WORK")])
-    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
-    # equal to the sentences referenced in ent.sents.
-    for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
-        assert doc_sent == ent_sent
-
-
-def test_for_no_ent_sents():
-    """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
-    sentence.
-    """
-    doc = Doc(
-        English().vocab,
-        words=["This", "is", "a", "test.", "ENTITY"],
-        sent_starts=[1, 0, 0, 0, 1],
-    )
-    doc.set_ents([Span(doc, 4, 5, "WORK")])
-    sents = list(doc.ents[0].sents)
-    assert len(sents) == 1
-    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
-
-
-def test_span_api_richcmp_other(en_tokenizer):
-    doc1 = en_tokenizer("a b")
-    doc2 = en_tokenizer("b c")
-    assert not doc1[1:2] == doc1[1]
-    assert not doc1[1:2] == doc2[0]
-    assert not doc1[1:2] == doc2[0:1]
-    assert not doc1[0:1] == doc2
+@pytest.mark.issue(11113)
+def test_span_ent_id(en_tokenizer):
+    doc = en_tokenizer("a b c d")
+    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
+    span = doc.ents[0]
+    assert doc[1].ent_id_ == "ID0"
+
+    # setting Span.id sets Token.ent_id
+    span.id_ = "ID1"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID1"
+    assert doc[1].ent_id_ == "ID1"
+
+    # Span.ent_id is an alias of Span.id
+    span.ent_id_ = "ID2"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID2"
+    assert doc[1].ent_id_ == "ID2"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 181c0ce0fce..50fc6e536c2 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -841,27 +841,33 @@ cdef class Doc:
                     self.c[i].ent_iob = 1
                 self.c[i].ent_type = span.label
                 self.c[i].ent_kb_id = span.kb_id
-                # for backwards compatibility in v3, only set ent_id from
-                # span.id if it's set, otherwise don't override
-                self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
+                self.c[i].ent_id = span.id
         for span in blocked:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 3
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
         for span in missing:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 0
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
         for span in outside:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 2
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
 
         # Set tokens outside of all provided spans
         if default != SetEntsDefault.unmodified:
             for i in range(self.length):
                 if i not in seen_tokens:
                     self.c[i].ent_type = 0
+                    self.c[i].ent_kb_id = 0
+                    self.c[i].ent_id = 0
                     if default == SetEntsDefault.outside:
                         self.c[i].ent_iob = 2
                     elif default == SetEntsDefault.missing:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index b982eb810b8..a6731d1c2d4 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -125,15 +125,23 @@ class Span:
     end: int
     start_char: int
     end_char: int
-    label: int
-    kb_id: int
-    id: int
-    ent_id: int
-    ent_id_: str
+    @property
+    def label(self) -> int: ...
+    @property
+    def kb_id(self) -> int: ...
+    @property
+    def id(self) -> int: ...
+    @property
+    def ent_id(self) -> int: ...
     @property
     def orth_(self) -> str: ...
     @property
     def lemma_(self) -> str: ...
-    label_: str
-    kb_id_: str
-    id_: str
+    @property
+    def label_(self) -> str: ...
+    @property
+    def kb_id_(self) -> str: ...
+    @property
+    def id_(self) -> str: ...
+    @property
+    def ent_id_(self) -> str: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 17c4c4c6059..b212b4c4303 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -836,26 +836,18 @@ cdef class Span:
 
     property id:
         def __get__(self):
-            return self.c.id
+            return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.c.id = id
+            self.span_c().id = id
 
     property ent_id:
-        """RETURNS (uint64): The entity ID."""
+        """Alias for the span's ID."""
         def __get__(self):
-            return self.root.ent_id
+            return self.id
 
-        def __set__(self, hash_t key):
-            raise NotImplementedError(Errors.E200.format(attr="ent_id"))
-
-    property ent_id_:
-        """RETURNS (str): The (string) entity ID."""
-        def __get__(self):
-            return self.root.ent_id_
-
-        def __set__(self, str key):
-            raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
+        def __set__(self, attr_t ent_id):
+            self.id = ent_id
 
     @property
     def orth_(self):
@@ -871,7 +863,7 @@ cdef class Span:
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
     property label_:
-        """RETURNS (str): The span's label."""
+        """The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
 
@@ -879,7 +871,7 @@ cdef class Span:
             self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
-        """RETURNS (str): The span's KB ID."""
+        """The span's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
 
@@ -887,13 +879,22 @@ cdef class Span:
             self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
     property id_:
-        """RETURNS (str): The span's ID."""
+        """The span's ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.id]
 
         def __set__(self, str id_):
             self.id = self.doc.vocab.strings.add(id_)
 
+    property ent_id_:
+        """Alias for the span's ID."""
+        def __get__(self):
+            return self.id_
+
+        def __set__(self, str ent_id_):
+            self.id_ = ent_id_
+
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 41422a5b4e1..5e7495f17ca 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -547,26 +547,26 @@ overlaps with will be returned.
 
 ## Attributes {id="attributes"}
 
-| Name           | Description                                                                                                                   |
-| -------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `doc`          | The parent document. ~~Doc~~                                                                                                  |
-| `tensor`       | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
-| `start`        | The token offset for the start of the span. ~~int~~                                                                           |
-| `end`          | The token offset for the end of the span. ~~int~~                                                                             |
-| `start_char`   | The character offset for the start of the span. ~~int~~                                                                       |
-| `end_char`     | The character offset for the end of the span. ~~int~~                                                                         |
-| `text`         | A string representation of the span text. ~~str~~                                                                             |
-| `text_with_ws` | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~                          |
-| `orth`         | ID of the verbatim text content. ~~int~~                                                                                      |
-| `orth_`        | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~            |
-| `label`        | The hash value of the span's label. ~~int~~                                                                                   |
-| `label_`       | The span's label. ~~str~~                                                                                                     |
-| `lemma_`       | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~                                      |
-| `kb_id`        | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
-| `kb_id_`       | The knowledge base ID referred to by the span. ~~str~~                                                                        |
-| `ent_id`       | The hash value of the named entity the root token is an instance of. ~~int~~                                                  |
-| `ent_id_`      | The string ID of the named entity the root token is an instance of. ~~str~~                                                   |
-| `id`           | The hash value of the span's ID. ~~int~~                                                                                      |
-| `id_`          | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`    | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
-| `_`            | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
+| Name                                    | Description                                                                                                                   |
+| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                   | The parent document. ~~Doc~~                                                                                                  |
+| `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
+| `start`                                 | The token offset for the start of the span. ~~int~~                                                                           |
+| `end`                                   | The token offset for the end of the span. ~~int~~                                                                             |
+| `start_char`                            | The character offset for the start of the span. ~~int~~                                                                       |
+| `end_char`                              | The character offset for the end of the span. ~~int~~                                                                         |
+| `text`                                  | A string representation of the span text. ~~str~~                                                                             |
+| `text_with_ws`                          | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~                          |
+| `orth`                                  | ID of the verbatim text content. ~~int~~                                                                                      |
+| `orth_`                                 | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~            |
+| `label`                                 | The hash value of the span's label. ~~int~~                                                                                   |
+| `label_`                                | The span's label. ~~str~~                                                                                                     |
+| `lemma_`                                | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~                                      |
+| `kb_id`                                 | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
+| `kb_id_`                                | The knowledge base ID referred to by the span. ~~str~~                                                                        |
+| `ent_id`                                | Alias for `id`: the hash value of the span's ID. ~~int~~                                                                      |
+| `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
+| `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
+| `id_`                                   | The span's ID. ~~str~~                                                                                                        |
+| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
+| `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 63ee1080bf1..12b99394350 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -403,75 +403,75 @@ The L2 norm of the token's vector representation.
 
 ## Attributes {id="attributes"}
 
-| Name                               | Description                                                                                                                                                                                                                                                          |
-| ---------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc`                              | The parent document. ~~Doc~~                                                                                                                                                                                                                                         |
-| `lex` <Tag variant="new">3</Tag>   | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                    |
-| `sent`                             | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                             |
-| `text`                             | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
-| `text_with_ws`                     | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                      |
-| `whitespace_`                      | Trailing space character if present. ~~str~~                                                                                                                                                                                                                         |
-| `orth`                             | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
-| `orth_`                            | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                  |
-| `vocab`                            | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                      |
-| `tensor`                           | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
-| `head`                             | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                        |
-| `left_edge`                        | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
-| `right_edge`                       | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                 |
-| `i`                                | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                           |
-| `ent_type`                         | Named entity type. ~~int~~                                                                                                                                                                                                                                           |
-| `ent_type_`                        | Named entity type. ~~str~~                                                                                                                                                                                                                                           |
-| `ent_iob`                          | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                 |
-| `ent_iob_`                         | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
-| `ent_kb_id`                        | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
-| `ent_kb_id_`                       | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
-| `ent_id`                           | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~                                                                                                                                        |
-| `ent_id_`                          | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                        |
-| `lemma`                            | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
-| `lemma_`                           | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
-| `norm`                             | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
-| `norm_`                            | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                   |
-| `lower`                            | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                 |
-| `lower_`                           | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                        |
-| `shape`                            | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_`                           | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
-| `prefix`                           | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                           |
-| `prefix_`                          | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                         |
-| `suffix`                           | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                             |
-| `suffix_`                          | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                             |
-| `is_alpha`                         | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                      |
-| `is_ascii`                         | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                          |
-| `is_digit`                         | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                     |
-| `is_lower`                         | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                            |
-| `is_upper`                         | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                            |
-| `is_title`                         | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                            |
-| `is_punct`                         | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
-| `is_left_punct`                    | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
-| `is_right_punct`                   | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
-| `is_sent_start`                    | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
-| `is_sent_end`                      | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
-| `is_space`                         | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
-| `is_bracket`                       | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
-| `is_quote`                         | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |
-| `is_currency`                      | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                             |
-| `like_url`                         | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                              |
-| `like_num`                         | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                           |
-| `like_email`                       | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
-| `is_oov`                           | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
-| `is_stop`                          | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
-| `pos`                              | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
-| `pos_`                             | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
-| `tag`                              | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
-| `tag_`                             | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
-| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
-| `dep`                              | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                               |
-| `dep_`                             | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                               |
-| `lang`                             | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                |
-| `lang_`                            | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
-| `prob`                             | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
-| `idx`                              | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                        | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
-| `lex_id`                           | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
-| `rank`                             | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
-| `cluster`                          | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
-| `_`                                | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                        |
+| Name                                         | Description                                                                                                                                                                                                                                                          |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                        | The parent document. ~~Doc~~                                                                                                                                                                                                                                         |
+| `lex` <Tag variant="new">3</Tag>             | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                    |
+| `sent` <Tag variant="new">2.0.12</Tag>       | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                             |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
+| `text_with_ws`                               | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                      |
+| `whitespace_`                                | Trailing space character if present. ~~str~~                                                                                                                                                                                                                         |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
+| `orth_`                                      | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                  |
+| `vocab`                                      | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                      |
+| `tensor` <Tag variant="new">2.1.7</Tag>      | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
+| `head`                                       | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                        |
+| `left_edge`                                  | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
+| `right_edge`                                 | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                 |
+| `i`                                          | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                           |
+| `ent_type`                                   | Named entity type. ~~int~~                                                                                                                                                                                                                                           |
+| `ent_type_`                                  | Named entity type. ~~str~~                                                                                                                                                                                                                                           |
+| `ent_iob`                                    | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                 |
+| `ent_iob_`                                   | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
+| `ent_kb_id` <Tag variant="new">2.2</Tag>     | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
+| `ent_kb_id_` <Tag variant="new">2.2</Tag>    | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
+| `ent_id`                                     | ID of the entity the token is an instance of, if any. ~~int~~                                                                                                                                                                                                        |
+| `ent_id_`                                    | ID of the entity the token is an instance of, if any. ~~str~~                                                                                                                                                                                                        |
+| `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
+| `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
+| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
+| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                   |
+| `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                 |
+| `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                        |
+| `shape`                                      | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `prefix`                                     | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                           |
+| `prefix_`                                    | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                         |
+| `suffix`                                     | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                             |
+| `suffix_`                                    | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                             |
+| `is_alpha`                                   | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                      |
+| `is_ascii`                                   | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                          |
+| `is_digit`                                   | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                     |
+| `is_lower`                                   | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                            |
+| `is_upper`                                   | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                            |
+| `is_title`                                   | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                            |
+| `is_punct`                                   | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
+| `is_left_punct`                              | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
+| `is_right_punct`                             | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
+| `is_sent_start`                              | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
+| `is_sent_end`                                | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
+| `is_space`                                   | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
+| `is_bracket`                                 | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
+| `is_quote`                                   | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                             |
+| `like_url`                                   | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                              |
+| `like_num`                                   | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                           |
+| `like_email`                                 | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
+| `is_oov`                                     | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
+| `is_stop`                                    | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
+| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
+| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
+| `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
+| `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
+| `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
+| `dep`                                        | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                               |
+| `dep_`                                       | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                               |
+| `lang`                                       | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                |
+| `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
+| `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
+| `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
+| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
+| `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
+| `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
+| `_`                                          | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                        |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index e5b98da3a8c..c90172b4325 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1399,14 +1399,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
 ruler.add_patterns(patterns)
 
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
 
 doc2 = nlp("Apple is opening its first big office in San Fran.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
 If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `ent_id_` property of the matched entity is set to the `id` given
+patterns, the `id_` property of the matched entity is set to the `id` given
 in the patterns. So in the example above it's easy to identify that "San
 Francisco" and "San Fran" are both the same entity.
 

From 52b6937d6bee0f14fb50d2acb603efbd6b2ad7d0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 26 Aug 2022 10:11:18 +0200
Subject: [PATCH 123/504] Switch to mecab-ko as default Korean tokenizer
 (#11294)

* Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.

* Temporarily run tests with mecab-ko tokenizer

* Fix types

* Fix duplicate test names

* Update requirements test

* Revert "Temporarily run tests with mecab-ko tokenizer"

This reverts commit d2083e7044403a2046f902b125a147525b703e29.

* Add mecab_args setting, fix pickle for KoreanNattoTokenizer

* Fix length check

* Update docs

* Formatting

* Update natto-py error message

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 setup.cfg                                 |   2 +-
 spacy/lang/ko/__init__.py                 | 121 +++++++++++++++++-----
 spacy/tests/conftest.py                   |  16 ++-
 spacy/tests/lang/ko/test_lemmatization.py |   8 ++
 spacy/tests/lang/ko/test_serialize.py     |  20 ++++
 spacy/tests/lang/ko/test_tokenizer.py     |  42 +++++++-
 spacy/tests/package/test_requirements.py  |   2 +-
 website/docs/usage/models.mdx             |  35 ++++++-
 8 files changed, 212 insertions(+), 34 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index f9274cfae98..cf2b775bb4e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -122,7 +122,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
 
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index e2c860f7de9..81052cb24aa 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -17,34 +17,23 @@
 
 [nlp.tokenizer]
 @tokenizers = "spacy.ko.KoreanTokenizer"
+mecab_args = ""
 """
 
 
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
-def create_tokenizer():
+def create_tokenizer(mecab_args: str):
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp.vocab)
+        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
 
     return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, vocab: Vocab):
+    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
-        self._mecab_tokenizer = None
-
-    @property
-    def mecab_tokenizer(self):
-        # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
-        # `spacy init vectors ko` for a pipeline that will have a different
-        # tokenizer in the end. The languages need to match for the vectors
-        # to be imported and there's no way to pass a custom config to
-        # `init vectors`.
-        if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
-        return self._mecab_tokenizer
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger(mecab_args)
 
     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)
@@ -67,13 +56,15 @@ def __call__(self, text: str) -> Doc:
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1],	종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5],	마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
                 break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
             if lemma == "*":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -96,20 +87,94 @@ class Korean(Language):
     Defaults = KoreanDefaults
 
 
-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        import mecab_ko as MeCab
 
         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "the python package `mecab-ko`: pip install mecab-ko"
         ) from None
 
 
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer
+
+    def __reduce__(self):
+        return KoreanNattoTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1],      종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5],     마지막 품사(end pos)[6], 표현(expression)[7], *
+        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
+            if node.is_eos():
+                break
+            surface = node.surface
+            feature = node.feature
+            tag, _, expr = feature.partition(",")
+            lemma, _, remainder = expr.partition("/")
+            if lemma == "*" or lemma == "":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 7db986ab9e7..2a9f441c9b0 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -245,7 +245,7 @@ def hsb_tokenizer():
 
 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer
 
 
@@ -267,6 +267,20 @@ def la_tokenizer():
     return get_lang_class("la")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py
index 7782ca4bcab..0c389b9ce52 100644
--- a/spacy/tests/lang/ko/test_lemmatization.py
+++ b/spacy/tests/lang/ko/test_lemmatization.py
@@ -7,3 +7,11 @@
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
index bba7bce6e05..eecc7d955ba 100644
--- a/spacy/tests/lang/ko/test_serialize.py
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -23,3 +23,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index 6e06e405e0b..e7f8a5c0d79 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -19,6 +19,8 @@
               "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on
 
+# tests for ko_tokenizer (default KoreanTokenizer)
+
 
 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     assert pos == expected_pos.split()
 
 
-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
 
@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"
 
 
+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index ff07c5b454a..704d4b90b44 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -25,7 +25,7 @@ def test_build_dependencies():
     libs_ignore_setup = [
         "numpy",
         "fugashi",
-        "natto-py",
+        "mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 7fed9f40765..9213dead16b 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -264,18 +264,49 @@ used for training the current [Japanese pipelines](/models/ja).
 
 ### Korean language support {id="korean"}
 
-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
+> # uses mecab-ko-dic
 > nlp = spacy.blank("ko")
+>
+> # with custom mecab args
+> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
+> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
+> nlp = spacy.blank("ko", config=config)
 > ```
 
-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
+requirements.
+
+The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
+earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
 
 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)
 
+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer:

From 7a23e5803dcfce18bfe848151b075ce542e03a5a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Aug 2022 13:23:24 +0200
Subject: [PATCH 124/504] Remove setup_requires from setup.cfg (#11384)

* Remove setup_requires from setup.cfg

* Update requirements test to ignore cython in setup.cfg
---
 setup.cfg                                | 13 +------------
 spacy/tests/package/test_requirements.py |  2 +-
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index cf2b775bb4e..935ac7d0ee4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,18 +30,7 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.7
-# NOTE: This section is superseded by pyproject.toml and will be removed in
-# spaCy v4
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0; python_version < "3.9"
-    numpy>=1.19.0; python_version >= "3.9"
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
+python_requires = >=3.6
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 704d4b90b44..a63b1d8b060 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -5,7 +5,7 @@
 def test_build_dependencies():
     # Check that library requirements are pinned exactly the same across different setup files.
     libs_ignore_requirements = [
-        "numpy",
+        "cython",
         "pytest",
         "pytest-timeout",
         "mock",

From fe2c879fb389c68b16d9745800347933ba1e086b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Aug 2022 13:56:35 +0200
Subject: [PATCH 125/504] Make stable private modules public and adjust names
 (#11353)

* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
---
 setup.py                                           |  2 +-
 .../ml/{_character_embed.py => character_embed.py} |  0
 spacy/ml/models/tok2vec.py                         |  6 ++++--
 spacy/pipeline/attribute_ruler.py                  |  4 ++--
 spacy/tests/pipeline/test_models.py                |  2 +-
 spacy/tests/pipeline/test_spancat.py               |  2 +-
 .../tests/serialize/test_serialize_span_groups.py  |  2 +-
 spacy/tokens/__init__.py                           |  3 ++-
 spacy/tokens/doc.pyi                               |  5 ++++-
 spacy/tokens/doc.pyx                               | 14 ++++++++++++++
 spacy/tokens/{_serialize.py => doc_bin.py}         | 11 ++++++-----
 spacy/tokens/{_retokenize.pyi => retokenizer.pyi}  |  0
 spacy/tokens/{_retokenize.pyx => retokenizer.pyx}  |  0
 spacy/tokens/{_dict_proxies.py => span_groups.py}  |  0
 14 files changed, 36 insertions(+), 15 deletions(-)
 rename spacy/ml/{_character_embed.py => character_embed.py} (100%)
 rename spacy/tokens/{_serialize.py => doc_bin.py} (97%)
 rename spacy/tokens/{_retokenize.pyi => retokenizer.pyi} (100%)
 rename spacy/tokens/{_retokenize.pyx => retokenizer.pyx} (100%)
 rename spacy/tokens/{_dict_proxies.py => span_groups.py} (100%)

diff --git a/setup.py b/setup.py
index 33178662df4..c9b4f7171e3 100755
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@
     "spacy.tokens.span_group",
     "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
-    "spacy.tokens._retokenize",
+    "spacy.tokens.retokenizer",
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/character_embed.py
similarity index 100%
rename from spacy/ml/_character_embed.py
rename to spacy/ml/character_embed.py
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 0edc8999114..a605d32cd40 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -21,7 +21,9 @@
 
 from ...attrs import intify_attr
 from ...errors import Errors
-from ...ml import _character_embed
+from ...ml import character_embed
+from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
@@ -241,7 +243,7 @@ def CharacterEmbed(
     if feature is None:
         raise ValueError(Errors.E911.format(feat=feature))
     char_embed = chain(
-        _character_embed.CharacterEmbed(nM=nM, nC=nC),
+        character_embed.CharacterEmbed(nM=nM, nC=nC),
         cast(Model[List[Floats2d], Ragged], list2ragged()),
     )
     feature_extractor: Model[List[Doc], Ragged] = chain(
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 8ac74d92bcd..126a48945bc 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -10,8 +10,8 @@
 from ..scorer import Scorer
 from ..symbols import IDS
 from ..tokens import Doc, Span
-from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
-from ..training import Example
+from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
+from ..vocab import Vocab
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py
index fef0017a8e1..4c0d352aa7f 100644
--- a/spacy/tests/pipeline/test_models.py
+++ b/spacy/tests/pipeline/test_models.py
@@ -8,7 +8,7 @@
 
 from spacy.lang.en import English
 from spacy.ml import FeatureExtractor, StaticVectors
-from spacy.ml._character_embed import CharacterEmbed
+from spacy.ml.character_embed import CharacterEmbed
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 9405a78e040..c143d193fa6 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -7,7 +7,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tokens import SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
 from spacy.util import fix_random_seed, make_tempdir, registry
 
diff --git a/spacy/tests/serialize/test_serialize_span_groups.py b/spacy/tests/serialize/test_serialize_span_groups.py
index 85313fcdcc3..c1c910fa137 100644
--- a/spacy/tests/serialize/test_serialize_span_groups.py
+++ b/spacy/tests/serialize/test_serialize_span_groups.py
@@ -1,7 +1,7 @@
 import pytest
 
 from spacy.tokens import Span, SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 
 
 @pytest.mark.issue(10685)
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 3393ca6eca9..e5a244360e3 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -3,6 +3,7 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .token import Token
+from .doc_bin import DocBin
+from .morphanalysis import MorphAnalysis
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f0b68862c32..0fae118b4b6 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -17,7 +17,10 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import Floats1d, Floats2d, Ints2d
-
+from .span import Span
+from .token import Token
+from .span_groups import SpanGroups
+from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
 from ..vocab import Vocab
 from ._dict_proxies import SpanGroups
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 50fc6e536c2..cee2eda6c53 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,6 +20,13 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
+from .span_groups import SpanGroups
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..typedefs cimport attr_t, flags_t
+from ..attrs cimport attr_id_t
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
 from ._dict_proxies import SpanGroups
 
@@ -50,6 +57,13 @@ from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors, Warnings
+from ..morphology import Morphology
+from .. import util
+from .. import parts_of_speech
+from .. import schemas
+from .underscore import Underscore, get_ext_args
+from .retokenizer import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from ._retokenize import Retokenizer
 from .underscore import Underscore, get_ext_args
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/doc_bin.py
similarity index 97%
rename from spacy/tokens/_serialize.py
rename to spacy/tokens/doc_bin.py
index 873d85835f0..8a08864d46e 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,11 +10,12 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import SimpleFrozenList, ensure_path
-from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
-from .doc import Doc
+from ..util import ensure_path, SimpleFrozenList
+from .span_groups import SpanGroups
+
+# fmt: off
+ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
+# fmt: on
 
 
 class DocBin:
diff --git a/spacy/tokens/_retokenize.pyi b/spacy/tokens/retokenizer.pyi
similarity index 100%
rename from spacy/tokens/_retokenize.pyi
rename to spacy/tokens/retokenizer.pyi
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/retokenizer.pyx
similarity index 100%
rename from spacy/tokens/_retokenize.pyx
rename to spacy/tokens/retokenizer.pyx
diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/span_groups.py
similarity index 100%
rename from spacy/tokens/_dict_proxies.py
rename to spacy/tokens/span_groups.py

From 4b12c18c2bb1dabcb5f3000485141b75a4ffd192 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 30 Aug 2022 22:40:31 +0900
Subject: [PATCH 126/504] Update/remove old Matcher syntax (#11370)

* Clean up old Matcher call style related stuff

In v2 Matcher.add was called with (key, on_match, *patterns). In v3 this
was changed to (key, patterns, *, on_match=None), but there were various
points where the old call syntax was documented or handled specially.
This removes all those.

The Matcher itself didn't need any code changes, as it just gives a
generic type error. However the PhraseMatcher required some changes
because it would automatically "fix" the old call style.

Surprisingly, the tokenizer was still using the old call style in one
place.

After these changes tests failed in two places:

1. one test for the "new" call style, including the "old" call style. I
   removed this test.
2. deserializing the PhraseMatcher fails because the input docs are a
   set.

I am not sure why 2 is happening - I guess it's a quirk of the
serialization format? - so for now I just convert the set to a list when
deserializing. The check that the input Docs are a List in the
PhraseMatcher is a new check, but makes it parallel with the other
Matchers, which seemed like the right thing to do.

* Add notes related to input docs / deserialization type

* Remove Typing import

* Remove old note about call style change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Use separate method for setting internal doc representations

In addition to the title change, this changes the internal dict to be a
defaultdict, instead of a dict with frequent use of setdefault.

* Add _add_from_arrays for unpickling

* Cleanup around adding from arrays

This moves adding to internal structures into the private batch method,
and removes the single-add method.

This has one behavioral change for `add`, in that if something is wrong
with the list of input Docs (such as one of the items not being a Doc),
valid items before the invalid one will not be added. Also the callback
will not be updated if anything is invalid. This change should not be
significant.

This also adds a test to check failure when given a non-Doc.

* Update spacy/matcher/phrasematcher.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                            |   7 +-
 spacy/matcher/dependencymatcher.pyx        |   6 +-
 spacy/matcher/matcher.pyx                  |   6 +-
 spacy/matcher/phrasematcher.pyi            |   9 ++
 spacy/matcher/phrasematcher.pyx            | 118 ++++++++++++---------
 spacy/tests/matcher/test_phrase_matcher.py |  29 ++---
 spacy/tokenizer.pyx                        |   2 +-
 website/docs/api/matcher.mdx               |  14 ---
 website/docs/api/phrasematcher.mdx         |  22 +---
 9 files changed, 97 insertions(+), 116 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index cf9a7b7087a..146c60b6d60 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -500,7 +500,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
-    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
+    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
             "callable or None, but got: {arg_type}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
@@ -759,7 +759,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "loaded nlp object, but got: {source}")
     E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
-    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
+    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
     E949 = ("Unable to align tokens for the predicted and reference docs. It "
             "is only possible to align the docs when both texts are the same "
@@ -989,6 +989,9 @@ class Errors(metaclass=ErrorsWithCodes):
              "reduction. Please enable one of `use_reduce_first`, "
              "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
+    # v4 error strings
+    E4000 = ("Expected a Doc as input, but got: '{type}'")
+
 
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index ab5f5d5d14b..0b639ab04fb 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -175,9 +175,9 @@ cdef class DependencyMatcher:
         on_match (callable): Optional callback executed on match.
         """
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index f0116169a6b..715dd45f07c 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -113,9 +113,9 @@ cdef class Matcher:
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
         if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
             raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
         for i, pattern in enumerate(patterns):
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 27f6ba373fc..f9585da7893 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -21,6 +21,15 @@ class PhraseMatcher:
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
         ] = ...,
     ) -> None: ...
+    def _add_from_arrays(
+        self,
+        key: str,
+        specs: List[List[int]],
+        *,
+        on_match: Optional[
+            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
+        ] = ...,
+    ) -> None: ...
     def remove(self, key: str) -> None: ...
     @overload
     def __call__(
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 4efcdb05c43..6e3c52924fa 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,5 +1,8 @@
-# cython: infer_types=True
-from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
+# cython: infer_types=True, profile=True
+from typing import List
+from collections import defaultdict
+from libc.stdint cimport uintptr_t
+from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 
 import warnings
 
@@ -39,7 +42,7 @@ cdef class PhraseMatcher:
         """
         self.vocab = vocab
         self._callbacks = {}
-        self._docs = {}
+        self._docs = defaultdict(set)
         self._validate = validate
 
         self.mem = Pool()
@@ -155,66 +158,24 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-    def add(self, key, docs, *_docs, on_match=None):
-        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
-        key, an on_match callback, and one or more patterns.
 
-        Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
-        second argument, with the on_match callback as an optional keyword
-        argument.
+    def _add_from_arrays(self, key, specs, *, on_match=None):
+        """Add a preprocessed list of specs, with an optional callback.
 
         key (str): The match ID.
-        docs (list): List of `Doc` objects representing match patterns.
+        specs (List[List[int]]): A list of lists of hashes to match.
         on_match (callable): Callback executed on match.
-        *_docs (Doc): For backwards compatibility: list of patterns to add
-            as variable arguments. Will be ignored if a list of patterns is
-            provided as the second argument.
-
-        DOCS: https://spacy.io/api/phrasematcher#add
         """
-        if docs is None or hasattr(docs, "__call__"):  # old API
-            on_match = docs
-            docs = _docs
-
-        _ = self.vocab[key]
-        self._callbacks[key] = on_match
-        self._docs.setdefault(key, set())
-
         cdef MapStruct* current_node
         cdef MapStruct* internal_node
         cdef void* result
 
-        if isinstance(docs, Doc):
-            raise ValueError(Errors.E179.format(key=key))
-        for doc in docs:
-            if len(doc) == 0:
-                continue
-            if isinstance(doc, Doc):
-                attrs = (TAG, POS, MORPH, LEMMA, DEP)
-                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                for attr in attrs:
-                    if self.attr == attr and not has_annotation[attr]:
-                        if attr == TAG:
-                            pipe = "tagger"
-                        elif attr in (POS, MORPH):
-                            pipe = "morphologizer or tagger+attribute_ruler"
-                        elif attr == LEMMA:
-                            pipe = "lemmatizer"
-                        elif attr == DEP:
-                            pipe = "parser"
-                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
-                        raise ValueError(error_msg)
-                if self._validate and any(has_annotation.values()) \
-                        and self.attr not in attrs:
-                    string_attr = self.vocab.strings[self.attr]
-                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
-                keyword = self._convert_to_array(doc)
-            else:
-                keyword = doc
-            self._docs[key].add(tuple(keyword))
+        self._callbacks[key] = on_match
+        for spec in specs:
+            self._docs[key].add(tuple(spec))
 
             current_node = self.c_map
-            for token in keyword:
+            for token in spec:
                 if token == self._terminal_hash:
                     warnings.warn(Warnings.W021)
                     break
@@ -233,6 +194,57 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
+
+    def add(self, key, docs, *, on_match=None):
+        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
+        key, a list of one or more patterns, and (optionally) an on_match callback.
+
+        key (str): The match ID.
+        docs (list): List of `Doc` objects representing match patterns.
+        on_match (callable): Callback executed on match.
+
+        If any of the input Docs are invalid, no internal state will be updated.
+
+        DOCS: https://spacy.io/api/phrasematcher#add
+        """
+        if isinstance(docs, Doc):
+            raise ValueError(Errors.E179.format(key=key))
+        if docs is None or not isinstance(docs, List):
+            raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
+        if on_match is not None and not hasattr(on_match, "__call__"):
+            raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))
+
+        _ = self.vocab[key]
+        specs = []
+
+        for doc in docs:
+            if len(doc) == 0:
+                continue
+            if not isinstance(doc, Doc):
+                raise ValueError(Errors.E4000.format(type=type(doc)))
+
+            attrs = (TAG, POS, MORPH, LEMMA, DEP)
+            has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+            for attr in attrs:
+                if self.attr == attr and not has_annotation[attr]:
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer or tagger+attribute_ruler"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
+            if self._validate and any(has_annotation.values()) \
+                    and self.attr not in attrs:
+                string_attr = self.vocab.strings[self.attr]
+                warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
+            specs.append(self._convert_to_array(doc))
+
+        self._add_from_arrays(key, specs, on_match=on_match)
+
     def __call__(self, object doclike, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.
 
@@ -345,7 +357,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     matcher = PhraseMatcher(vocab, attr=attr)
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
-        matcher.add(key, specs, on_match=callback)
+        matcher._add_from_arrays(key, specs, on_match=callback)
     return matcher
 
 
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 7335bbdf107..4ad234cba3b 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -198,28 +198,6 @@ def test_phrase_matcher_contains(en_vocab):
     assert "TEST2" not in matcher
 
 
-def test_phrase_matcher_add_new_api(en_vocab):
-    doc = Doc(en_vocab, words=["a", "b"])
-    patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("OLD_API", None, *patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("NEW_API", patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-
-
 def test_phrase_matcher_repeated_add(en_vocab):
     matcher = PhraseMatcher(en_vocab)
     # match ID only gets added once
@@ -468,6 +446,13 @@ def test_phrase_matcher_deprecated(en_vocab):
         assert "spaCy v3.0" in str(record.list[0].message)
 
 
+def test_phrase_matcher_non_doc(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.raises(ValueError):
+        matcher.add("TEST", [doc, "junk"])
+
+
 @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
 def test_phrase_matcher_sent_start(en_vocab, attr):
     _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 9b79207f82e..cdb7dda7094 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -619,7 +619,7 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._flush_cache()
         if not self.faster_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
-            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
+            self._special_matcher.add(string, [self._tokenize_affixes(string, False)])
 
     def _reload_special_cases(self):
         self._flush_cache()
diff --git a/website/docs/api/matcher.mdx b/website/docs/api/matcher.mdx
index c66579da814..66954b6c4fb 100644
--- a/website/docs/api/matcher.mdx
+++ b/website/docs/api/matcher.mdx
@@ -211,20 +211,6 @@ will be overwritten.
 > matches = matcher(doc)
 > ```
 
-<Infobox title="Changed in v3.0" variant="warning">
-
-As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
-(instead of a variable number of arguments). The `on_match` callback becomes an
-optional keyword argument.
-
-```diff
-patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-- matcher.add("GoogleNow", on_match, *patterns)
-+ matcher.add("GoogleNow", patterns, on_match=on_match)
-```
-
-</Infobox>
-
 | Name                                | Description                                                                                                                                                |
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `match_id`                          | An ID for the thing you're matching. ~~str~~                                                                                                               |
diff --git a/website/docs/api/phrasematcher.mdx b/website/docs/api/phrasematcher.mdx
index 14ccefb772e..2c5e767dcba 100644
--- a/website/docs/api/phrasematcher.mdx
+++ b/website/docs/api/phrasematcher.mdx
@@ -116,10 +116,10 @@ Check whether the matcher contains rules for a match ID.
 ## PhraseMatcher.add {id="add",tag="method"}
 
 Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
 
 > #### Example
 >
@@ -134,20 +134,6 @@ overwritten.
 >   matches = matcher(doc)
 > ```
 
-<Infobox title="Changed in v3.0" variant="warning">
-
-As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
-argument (instead of a variable number of arguments). The `on_match` callback
-becomes an optional keyword argument.
-
-```diff
-patterns = [nlp("health care reform"), nlp("healthcare reform")]
-- matcher.add("HEALTH", on_match, *patterns)
-+ matcher.add("HEALTH", patterns, on_match=on_match)
-```
-
-</Infobox>
-
 | Name           | Description                                                                                                                                                |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `key`          | An ID for the thing you're matching. ~~str~~                                                                                                               |

From bab73e3e026c0d6be63fa98a6b46a4292f6560cc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Sep 2022 09:08:40 +0200
Subject: [PATCH 127/504] Consolidate and freeze symbols (#11352)

* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
---
 spacy/attrs.pxd             | 129 +++-------
 spacy/attrs.pyx             |  49 +---
 spacy/parts_of_speech.pxd   |  38 +--
 spacy/schemas.py            |   2 +-
 spacy/strings.pyx           |   4 +-
 spacy/symbols.pxd           |  15 +-
 spacy/symbols.pyx           |   6 +-
 spacy/tests/test_symbols.py | 467 ++++++++++++++++++++++++++++++++++++
 spacy/tokens/doc.pyx        |  20 +-
 9 files changed, 551 insertions(+), 179 deletions(-)
 create mode 100644 spacy/tests/test_symbols.py

diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index fbbac0ec29c..b8972cb714e 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,99 +1,50 @@
-# Reserve 64 values for flag features
 from . cimport symbols
 
 
 cdef enum attr_id_t:
-    NULL_ATTR
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-    LIKE_URL
-    LIKE_NUM
-    LIKE_EMAIL
-    IS_STOP
-    IS_OOV_DEPRECATED
-    IS_BRACKET
-    IS_QUOTE
-    IS_LEFT_PUNCT
-    IS_RIGHT_PUNCT
-    IS_CURRENCY
+    NULL_ATTR = 0
+    IS_ALPHA = symbols.IS_ALPHA
+    IS_ASCII = symbols.IS_ASCII
+    IS_DIGIT = symbols.IS_DIGIT
+    IS_LOWER = symbols.IS_LOWER
+    IS_PUNCT = symbols.IS_PUNCT
+    IS_SPACE = symbols.IS_SPACE
+    IS_TITLE = symbols.IS_TITLE
+    IS_UPPER = symbols.IS_UPPER
+    LIKE_URL = symbols.LIKE_URL
+    LIKE_NUM = symbols.LIKE_NUM
+    LIKE_EMAIL = symbols.LIKE_EMAIL
+    IS_STOP = symbols.IS_STOP
+    IS_BRACKET = symbols.IS_BRACKET
+    IS_QUOTE = symbols.IS_QUOTE
+    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
+    IS_CURRENCY = symbols.IS_CURRENCY
 
-    FLAG19 = 19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
+    ID = symbols.ID
+    ORTH = symbols.ORTH
+    LOWER = symbols.LOWER
+    NORM = symbols.NORM
+    SHAPE = symbols.SHAPE
+    PREFIX = symbols.PREFIX
+    SUFFIX = symbols.SUFFIX
 
-    ID
-    ORTH
-    LOWER
-    NORM
-    SHAPE
-    PREFIX
-    SUFFIX
+    LENGTH = symbols.LENGTH
+    CLUSTER = symbols.CLUSTER
+    LEMMA = symbols.LEMMA
+    POS = symbols.POS
+    TAG = symbols.TAG
+    DEP = symbols.DEP
+    ENT_IOB = symbols.ENT_IOB
+    ENT_TYPE = symbols.ENT_TYPE
+    HEAD = symbols.HEAD
+    SENT_START = symbols.SENT_START
+    SPACY = symbols.SPACY
+    PROB = symbols.PROB
 
-    LENGTH
-    CLUSTER
-    LEMMA
-    POS
-    TAG
-    DEP
-    ENT_IOB
-    ENT_TYPE
-    HEAD
-    SENT_START
-    SPACY
-    PROB
-
-    LANG
+    LANG = symbols.LANG
     ENT_KB_ID = symbols.ENT_KB_ID
-    MORPH
+    MORPH = symbols.MORPH
     ENT_ID = symbols.ENT_ID
 
-    IDX
-    SENT_END
+    IDX = symbols.IDX
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 0a4aecc5d85..1688afe47af 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -17,57 +17,11 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
     "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "IS_CURRENCY": IS_CURRENCY,
-    "FLAG19": FLAG19,
-    "FLAG20": FLAG20,
-    "FLAG21": FLAG21,
-    "FLAG22": FLAG22,
-    "FLAG23": FLAG23,
-    "FLAG24": FLAG24,
-    "FLAG25": FLAG25,
-    "FLAG26": FLAG26,
-    "FLAG27": FLAG27,
-    "FLAG28": FLAG28,
-    "FLAG29": FLAG29,
-    "FLAG30": FLAG30,
-    "FLAG31": FLAG31,
-    "FLAG32": FLAG32,
-    "FLAG33": FLAG33,
-    "FLAG34": FLAG34,
-    "FLAG35": FLAG35,
-    "FLAG36": FLAG36,
-    "FLAG37": FLAG37,
-    "FLAG38": FLAG38,
-    "FLAG39": FLAG39,
-    "FLAG40": FLAG40,
-    "FLAG41": FLAG41,
-    "FLAG42": FLAG42,
-    "FLAG43": FLAG43,
-    "FLAG44": FLAG44,
-    "FLAG45": FLAG45,
-    "FLAG46": FLAG46,
-    "FLAG47": FLAG47,
-    "FLAG48": FLAG48,
-    "FLAG49": FLAG49,
-    "FLAG50": FLAG50,
-    "FLAG51": FLAG51,
-    "FLAG52": FLAG52,
-    "FLAG53": FLAG53,
-    "FLAG54": FLAG54,
-    "FLAG55": FLAG55,
-    "FLAG56": FLAG56,
-    "FLAG57": FLAG57,
-    "FLAG58": FLAG58,
-    "FLAG59": FLAG59,
-    "FLAG60": FLAG60,
-    "FLAG61": FLAG61,
-    "FLAG62": FLAG62,
-    "FLAG63": FLAG63,
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,
@@ -93,8 +47,7 @@ IDS = {
 }
 
 
-# ATTR IDs, in order of the symbol
-NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+NAMES = {v: k for k, v in IDS.items()}
 locals().update(IDS)
 
 
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index b5423d11301..01f116ea688 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -4,22 +4,22 @@ from . cimport symbols
 cpdef enum univ_pos_t:
     NO_TAG = 0
     ADJ = symbols.ADJ
-    ADP
-    ADV
-    AUX
-    CONJ
-    CCONJ  # U20
-    DET
-    INTJ
-    NOUN
-    NUM
-    PART
-    PRON
-    PROPN
-    PUNCT
-    SCONJ
-    SYM
-    VERB
-    X
-    EOL
-    SPACE
+    ADP = symbols.ADP
+    ADV = symbols.ADV
+    AUX = symbols.AUX
+    CONJ = symbols.CONJ
+    CCONJ  = symbols.CCONJ  # U20
+    DET = symbols.DET
+    INTJ = symbols.INTJ
+    NOUN = symbols.NOUN
+    NUM = symbols.NUM
+    PART = symbols.PART
+    PRON = symbols.PRON
+    PROPN = symbols.PROPN
+    PUNCT = symbols.PUNCT
+    SCONJ = symbols.SCONJ
+    SYM = symbols.SYM
+    VERB = symbols.VERB
+    X = symbols.X
+    EOL = symbols.EOL
+    SPACE = symbols.SPACE
diff --git a/spacy/schemas.py b/spacy/schemas.py
index fa987b90f19..9a2b5ed60e9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -181,7 +181,7 @@ def validate_init_settings(
 
 def validate_token_pattern(obj: list) -> List[str]:
     # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
-    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
+    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
     if isinstance(obj, list):
         converted = []
         for pattern in obj:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 376a131751e..e73b66dff54 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -148,7 +148,7 @@ cdef class StringStore:
         elif _try_coerce_to_hash(string_or_id, &str_hash):
             if str_hash == 0:
                 return ""
-            elif str_hash < len(SYMBOLS_BY_INT):
+            elif str_hash in SYMBOLS_BY_INT:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
@@ -224,7 +224,7 @@ cdef class StringStore:
             # TODO: Raise an error instead
             return self._map.get(string_or_id) is not NULL
 
-        if str_hash < len(SYMBOLS_BY_INT):
+        if str_hash in SYMBOLS_BY_INT:
             return True
         else:
             return self._map.get(str_hash) is not NULL
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 73be19145b2..9e74bf67620 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -1,5 +1,6 @@
+# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0.
 cdef enum symbol_t:
-    NIL
+    NIL = 0
     IS_ALPHA
     IS_ASCII
     IS_DIGIT
@@ -65,7 +66,7 @@ cdef enum symbol_t:
     FLAG62
     FLAG63
 
-    ID
+    ID = 64
     ORTH
     LOWER
     NORM
@@ -385,7 +386,7 @@ cdef enum symbol_t:
     DEPRECATED275
     DEPRECATED276
 
-    PERSON
+    PERSON = 380
     NORP
     FACILITY
     ORG
@@ -405,7 +406,7 @@ cdef enum symbol_t:
     ORDINAL
     CARDINAL
 
-    acomp
+    acomp = 398
     advcl
     advmod
     agent
@@ -458,12 +459,12 @@ cdef enum symbol_t:
     rcmod
     root
     xcomp
-
     acl
 
-    ENT_KB_ID
+    ENT_KB_ID = 452
     MORPH
     ENT_ID
 
     IDX
-    _
+    _ = 456
+    # DO NOT ADD ANY NEW SYMBOLS!
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index f7713577bd3..d2a8a428954 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -470,11 +470,7 @@ IDS = {
 }
 
 
-def sort_nums(x):
-    return x[1]
-
-
-NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
+NAMES = {v: k for k, v in IDS.items()}
 # Unfortunate hack here, to work around problem with long cpdef enum
 # (which is generating an enormous amount of C++ in Cython 0.24+)
 # We keep the enum cdef, and just make sure the names are available to Python
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
new file mode 100644
index 00000000000..fb034accac2
--- /dev/null
+++ b/spacy/tests/test_symbols.py
@@ -0,0 +1,467 @@
+import pytest
+from spacy.symbols import IDS, NAMES
+
+V3_SYMBOLS = {
+    "": 0,
+    "IS_ALPHA": 1,
+    "IS_ASCII": 2,
+    "IS_DIGIT": 3,
+    "IS_LOWER": 4,
+    "IS_PUNCT": 5,
+    "IS_SPACE": 6,
+    "IS_TITLE": 7,
+    "IS_UPPER": 8,
+    "LIKE_URL": 9,
+    "LIKE_NUM": 10,
+    "LIKE_EMAIL": 11,
+    "IS_STOP": 12,
+    "IS_OOV_DEPRECATED": 13,
+    "IS_BRACKET": 14,
+    "IS_QUOTE": 15,
+    "IS_LEFT_PUNCT": 16,
+    "IS_RIGHT_PUNCT": 17,
+    "IS_CURRENCY": 18,
+    "FLAG19": 19,
+    "FLAG20": 20,
+    "FLAG21": 21,
+    "FLAG22": 22,
+    "FLAG23": 23,
+    "FLAG24": 24,
+    "FLAG25": 25,
+    "FLAG26": 26,
+    "FLAG27": 27,
+    "FLAG28": 28,
+    "FLAG29": 29,
+    "FLAG30": 30,
+    "FLAG31": 31,
+    "FLAG32": 32,
+    "FLAG33": 33,
+    "FLAG34": 34,
+    "FLAG35": 35,
+    "FLAG36": 36,
+    "FLAG37": 37,
+    "FLAG38": 38,
+    "FLAG39": 39,
+    "FLAG40": 40,
+    "FLAG41": 41,
+    "FLAG42": 42,
+    "FLAG43": 43,
+    "FLAG44": 44,
+    "FLAG45": 45,
+    "FLAG46": 46,
+    "FLAG47": 47,
+    "FLAG48": 48,
+    "FLAG49": 49,
+    "FLAG50": 50,
+    "FLAG51": 51,
+    "FLAG52": 52,
+    "FLAG53": 53,
+    "FLAG54": 54,
+    "FLAG55": 55,
+    "FLAG56": 56,
+    "FLAG57": 57,
+    "FLAG58": 58,
+    "FLAG59": 59,
+    "FLAG60": 60,
+    "FLAG61": 61,
+    "FLAG62": 62,
+    "FLAG63": 63,
+    "ID": 64,
+    "ORTH": 65,
+    "LOWER": 66,
+    "NORM": 67,
+    "SHAPE": 68,
+    "PREFIX": 69,
+    "SUFFIX": 70,
+    "LENGTH": 71,
+    "CLUSTER": 72,
+    "LEMMA": 73,
+    "POS": 74,
+    "TAG": 75,
+    "DEP": 76,
+    "ENT_IOB": 77,
+    "ENT_TYPE": 78,
+    "ENT_ID": 454,
+    "ENT_KB_ID": 452,
+    "HEAD": 79,
+    "SENT_START": 80,
+    "SPACY": 81,
+    "PROB": 82,
+    "LANG": 83,
+    "IDX": 455,
+    "ADJ": 84,
+    "ADP": 85,
+    "ADV": 86,
+    "AUX": 87,
+    "CONJ": 88,
+    "CCONJ": 89,
+    "DET": 90,
+    "INTJ": 91,
+    "NOUN": 92,
+    "NUM": 93,
+    "PART": 94,
+    "PRON": 95,
+    "PROPN": 96,
+    "PUNCT": 97,
+    "SCONJ": 98,
+    "SYM": 99,
+    "VERB": 100,
+    "X": 101,
+    "EOL": 102,
+    "SPACE": 103,
+    "DEPRECATED001": 104,
+    "DEPRECATED002": 105,
+    "DEPRECATED003": 106,
+    "DEPRECATED004": 107,
+    "DEPRECATED005": 108,
+    "DEPRECATED006": 109,
+    "DEPRECATED007": 110,
+    "DEPRECATED008": 111,
+    "DEPRECATED009": 112,
+    "DEPRECATED010": 113,
+    "DEPRECATED011": 114,
+    "DEPRECATED012": 115,
+    "DEPRECATED013": 116,
+    "DEPRECATED014": 117,
+    "DEPRECATED015": 118,
+    "DEPRECATED016": 119,
+    "DEPRECATED017": 120,
+    "DEPRECATED018": 121,
+    "DEPRECATED019": 122,
+    "DEPRECATED020": 123,
+    "DEPRECATED021": 124,
+    "DEPRECATED022": 125,
+    "DEPRECATED023": 126,
+    "DEPRECATED024": 127,
+    "DEPRECATED025": 128,
+    "DEPRECATED026": 129,
+    "DEPRECATED027": 130,
+    "DEPRECATED028": 131,
+    "DEPRECATED029": 132,
+    "DEPRECATED030": 133,
+    "DEPRECATED031": 134,
+    "DEPRECATED032": 135,
+    "DEPRECATED033": 136,
+    "DEPRECATED034": 137,
+    "DEPRECATED035": 138,
+    "DEPRECATED036": 139,
+    "DEPRECATED037": 140,
+    "DEPRECATED038": 141,
+    "DEPRECATED039": 142,
+    "DEPRECATED040": 143,
+    "DEPRECATED041": 144,
+    "DEPRECATED042": 145,
+    "DEPRECATED043": 146,
+    "DEPRECATED044": 147,
+    "DEPRECATED045": 148,
+    "DEPRECATED046": 149,
+    "DEPRECATED047": 150,
+    "DEPRECATED048": 151,
+    "DEPRECATED049": 152,
+    "DEPRECATED050": 153,
+    "DEPRECATED051": 154,
+    "DEPRECATED052": 155,
+    "DEPRECATED053": 156,
+    "DEPRECATED054": 157,
+    "DEPRECATED055": 158,
+    "DEPRECATED056": 159,
+    "DEPRECATED057": 160,
+    "DEPRECATED058": 161,
+    "DEPRECATED059": 162,
+    "DEPRECATED060": 163,
+    "DEPRECATED061": 164,
+    "DEPRECATED062": 165,
+    "DEPRECATED063": 166,
+    "DEPRECATED064": 167,
+    "DEPRECATED065": 168,
+    "DEPRECATED066": 169,
+    "DEPRECATED067": 170,
+    "DEPRECATED068": 171,
+    "DEPRECATED069": 172,
+    "DEPRECATED070": 173,
+    "DEPRECATED071": 174,
+    "DEPRECATED072": 175,
+    "DEPRECATED073": 176,
+    "DEPRECATED074": 177,
+    "DEPRECATED075": 178,
+    "DEPRECATED076": 179,
+    "DEPRECATED077": 180,
+    "DEPRECATED078": 181,
+    "DEPRECATED079": 182,
+    "DEPRECATED080": 183,
+    "DEPRECATED081": 184,
+    "DEPRECATED082": 185,
+    "DEPRECATED083": 186,
+    "DEPRECATED084": 187,
+    "DEPRECATED085": 188,
+    "DEPRECATED086": 189,
+    "DEPRECATED087": 190,
+    "DEPRECATED088": 191,
+    "DEPRECATED089": 192,
+    "DEPRECATED090": 193,
+    "DEPRECATED091": 194,
+    "DEPRECATED092": 195,
+    "DEPRECATED093": 196,
+    "DEPRECATED094": 197,
+    "DEPRECATED095": 198,
+    "DEPRECATED096": 199,
+    "DEPRECATED097": 200,
+    "DEPRECATED098": 201,
+    "DEPRECATED099": 202,
+    "DEPRECATED100": 203,
+    "DEPRECATED101": 204,
+    "DEPRECATED102": 205,
+    "DEPRECATED103": 206,
+    "DEPRECATED104": 207,
+    "DEPRECATED105": 208,
+    "DEPRECATED106": 209,
+    "DEPRECATED107": 210,
+    "DEPRECATED108": 211,
+    "DEPRECATED109": 212,
+    "DEPRECATED110": 213,
+    "DEPRECATED111": 214,
+    "DEPRECATED112": 215,
+    "DEPRECATED113": 216,
+    "DEPRECATED114": 217,
+    "DEPRECATED115": 218,
+    "DEPRECATED116": 219,
+    "DEPRECATED117": 220,
+    "DEPRECATED118": 221,
+    "DEPRECATED119": 222,
+    "DEPRECATED120": 223,
+    "DEPRECATED121": 224,
+    "DEPRECATED122": 225,
+    "DEPRECATED123": 226,
+    "DEPRECATED124": 227,
+    "DEPRECATED125": 228,
+    "DEPRECATED126": 229,
+    "DEPRECATED127": 230,
+    "DEPRECATED128": 231,
+    "DEPRECATED129": 232,
+    "DEPRECATED130": 233,
+    "DEPRECATED131": 234,
+    "DEPRECATED132": 235,
+    "DEPRECATED133": 236,
+    "DEPRECATED134": 237,
+    "DEPRECATED135": 238,
+    "DEPRECATED136": 239,
+    "DEPRECATED137": 240,
+    "DEPRECATED138": 241,
+    "DEPRECATED139": 242,
+    "DEPRECATED140": 243,
+    "DEPRECATED141": 244,
+    "DEPRECATED142": 245,
+    "DEPRECATED143": 246,
+    "DEPRECATED144": 247,
+    "DEPRECATED145": 248,
+    "DEPRECATED146": 249,
+    "DEPRECATED147": 250,
+    "DEPRECATED148": 251,
+    "DEPRECATED149": 252,
+    "DEPRECATED150": 253,
+    "DEPRECATED151": 254,
+    "DEPRECATED152": 255,
+    "DEPRECATED153": 256,
+    "DEPRECATED154": 257,
+    "DEPRECATED155": 258,
+    "DEPRECATED156": 259,
+    "DEPRECATED157": 260,
+    "DEPRECATED158": 261,
+    "DEPRECATED159": 262,
+    "DEPRECATED160": 263,
+    "DEPRECATED161": 264,
+    "DEPRECATED162": 265,
+    "DEPRECATED163": 266,
+    "DEPRECATED164": 267,
+    "DEPRECATED165": 268,
+    "DEPRECATED166": 269,
+    "DEPRECATED167": 270,
+    "DEPRECATED168": 271,
+    "DEPRECATED169": 272,
+    "DEPRECATED170": 273,
+    "DEPRECATED171": 274,
+    "DEPRECATED172": 275,
+    "DEPRECATED173": 276,
+    "DEPRECATED174": 277,
+    "DEPRECATED175": 278,
+    "DEPRECATED176": 279,
+    "DEPRECATED177": 280,
+    "DEPRECATED178": 281,
+    "DEPRECATED179": 282,
+    "DEPRECATED180": 283,
+    "DEPRECATED181": 284,
+    "DEPRECATED182": 285,
+    "DEPRECATED183": 286,
+    "DEPRECATED184": 287,
+    "DEPRECATED185": 288,
+    "DEPRECATED186": 289,
+    "DEPRECATED187": 290,
+    "DEPRECATED188": 291,
+    "DEPRECATED189": 292,
+    "DEPRECATED190": 293,
+    "DEPRECATED191": 294,
+    "DEPRECATED192": 295,
+    "DEPRECATED193": 296,
+    "DEPRECATED194": 297,
+    "DEPRECATED195": 298,
+    "DEPRECATED196": 299,
+    "DEPRECATED197": 300,
+    "DEPRECATED198": 301,
+    "DEPRECATED199": 302,
+    "DEPRECATED200": 303,
+    "DEPRECATED201": 304,
+    "DEPRECATED202": 305,
+    "DEPRECATED203": 306,
+    "DEPRECATED204": 307,
+    "DEPRECATED205": 308,
+    "DEPRECATED206": 309,
+    "DEPRECATED207": 310,
+    "DEPRECATED208": 311,
+    "DEPRECATED209": 312,
+    "DEPRECATED210": 313,
+    "DEPRECATED211": 314,
+    "DEPRECATED212": 315,
+    "DEPRECATED213": 316,
+    "DEPRECATED214": 317,
+    "DEPRECATED215": 318,
+    "DEPRECATED216": 319,
+    "DEPRECATED217": 320,
+    "DEPRECATED218": 321,
+    "DEPRECATED219": 322,
+    "DEPRECATED220": 323,
+    "DEPRECATED221": 324,
+    "DEPRECATED222": 325,
+    "DEPRECATED223": 326,
+    "DEPRECATED224": 327,
+    "DEPRECATED225": 328,
+    "DEPRECATED226": 329,
+    "DEPRECATED227": 330,
+    "DEPRECATED228": 331,
+    "DEPRECATED229": 332,
+    "DEPRECATED230": 333,
+    "DEPRECATED231": 334,
+    "DEPRECATED232": 335,
+    "DEPRECATED233": 336,
+    "DEPRECATED234": 337,
+    "DEPRECATED235": 338,
+    "DEPRECATED236": 339,
+    "DEPRECATED237": 340,
+    "DEPRECATED238": 341,
+    "DEPRECATED239": 342,
+    "DEPRECATED240": 343,
+    "DEPRECATED241": 344,
+    "DEPRECATED242": 345,
+    "DEPRECATED243": 346,
+    "DEPRECATED244": 347,
+    "DEPRECATED245": 348,
+    "DEPRECATED246": 349,
+    "DEPRECATED247": 350,
+    "DEPRECATED248": 351,
+    "DEPRECATED249": 352,
+    "DEPRECATED250": 353,
+    "DEPRECATED251": 354,
+    "DEPRECATED252": 355,
+    "DEPRECATED253": 356,
+    "DEPRECATED254": 357,
+    "DEPRECATED255": 358,
+    "DEPRECATED256": 359,
+    "DEPRECATED257": 360,
+    "DEPRECATED258": 361,
+    "DEPRECATED259": 362,
+    "DEPRECATED260": 363,
+    "DEPRECATED261": 364,
+    "DEPRECATED262": 365,
+    "DEPRECATED263": 366,
+    "DEPRECATED264": 367,
+    "DEPRECATED265": 368,
+    "DEPRECATED266": 369,
+    "DEPRECATED267": 370,
+    "DEPRECATED268": 371,
+    "DEPRECATED269": 372,
+    "DEPRECATED270": 373,
+    "DEPRECATED271": 374,
+    "DEPRECATED272": 375,
+    "DEPRECATED273": 376,
+    "DEPRECATED274": 377,
+    "DEPRECATED275": 378,
+    "DEPRECATED276": 379,
+    "PERSON": 380,
+    "NORP": 381,
+    "FACILITY": 382,
+    "ORG": 383,
+    "GPE": 384,
+    "LOC": 385,
+    "PRODUCT": 386,
+    "EVENT": 387,
+    "WORK_OF_ART": 388,
+    "LANGUAGE": 389,
+    "DATE": 391,
+    "TIME": 392,
+    "PERCENT": 393,
+    "MONEY": 394,
+    "QUANTITY": 395,
+    "ORDINAL": 396,
+    "CARDINAL": 397,
+    "acomp": 398,
+    "advcl": 399,
+    "advmod": 400,
+    "agent": 401,
+    "amod": 402,
+    "appos": 403,
+    "attr": 404,
+    "aux": 405,
+    "auxpass": 406,
+    "cc": 407,
+    "ccomp": 408,
+    "complm": 409,
+    "conj": 410,
+    "cop": 411,
+    "csubj": 412,
+    "csubjpass": 413,
+    "dep": 414,
+    "det": 415,
+    "dobj": 416,
+    "expl": 417,
+    "hmod": 418,
+    "hyph": 419,
+    "infmod": 420,
+    "intj": 421,
+    "iobj": 422,
+    "mark": 423,
+    "meta": 424,
+    "neg": 425,
+    "nmod": 426,
+    "nn": 427,
+    "npadvmod": 428,
+    "nsubj": 429,
+    "nsubjpass": 430,
+    "num": 431,
+    "number": 432,
+    "oprd": 433,
+    "obj": 434,
+    "obl": 435,
+    "parataxis": 436,
+    "partmod": 437,
+    "pcomp": 438,
+    "pobj": 439,
+    "poss": 440,
+    "possessive": 441,
+    "preconj": 442,
+    "prep": 443,
+    "prt": 444,
+    "punct": 445,
+    "quantmod": 446,
+    "rcmod": 448,
+    "relcl": 447,
+    "root": 449,
+    "xcomp": 450,
+    "acl": 451,
+    "LAW": 390,
+    "MORPH": 453,
+    "_": 456,
+}
+
+
+def test_frozen_symbols():
+    assert IDS == V3_SYMBOLS
+    assert NAMES == {v: k for k, v in IDS.items()}
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index cee2eda6c53..8db8c1d6f37 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1022,22 +1022,26 @@ cdef class Doc:
                 for id_ in py_attr_ids
             ]
         except KeyError as msg:
-            keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
+            keys = list(IDS.keys())
             raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
         # Make an array from the attributes --- otherwise our inner loop is
         # Python dict iteration.
-        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
-        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
+        cdef Pool mem = Pool()
+        cdef int n_attrs = len(py_attr_ids)
+        cdef attr_id_t* c_attr_ids
+        if n_attrs > 0:
+            c_attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
+            for i, attr_id in enumerate(py_attr_ids):
+                c_attr_ids[i] = attr_id
+        output = numpy.ndarray(shape=(self.length, n_attrs), dtype=numpy.uint64)
         c_output = <attr_t*>output.data
-        c_attr_ids = <attr_id_t*>attr_ids.data
         cdef TokenC* token
-        cdef int nr_attr = attr_ids.shape[0]
         for i in range(self.length):
             token = &self.c[i]
-            for j in range(nr_attr):
-                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
+            for j in range(n_attrs):
+                c_output[i*n_attrs + j] = get_token_attr(token, c_attr_ids[j])
         # Handle 1d case
-        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
+        return output if n_attrs >= 2 else output.reshape((self.length,))
 
     def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
         """Count the frequencies of a given attribute. Produces a dict of

From 9ff38d07f1b3279d9941d3dbd37a0f70b62b0ec3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH 128/504] Store activations in `Doc`s when `save_activations`
 is enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/edit_tree_lemmatizer.py        |  46 ++--
 spacy/pipeline/entity_linker.py               | 244 +++++++++++-------
 spacy/pipeline/morphologizer.pyx              |  37 ++-
 spacy/pipeline/senter.pyx                     |  38 ++-
 spacy/pipeline/spancat.py                     |  84 +++---
 spacy/pipeline/tagger.pyx                     |  43 ++-
 spacy/pipeline/textcat.py                     |  37 ++-
 spacy/pipeline/textcat_multilabel.py          |  23 +-
 spacy/pipeline/trainable_pipe.pxd             |   1 +
 spacy/pipeline/trainable_pipe.pyx             |  14 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |  26 ++
 spacy/tests/pipeline/test_entity_linker.py    |  78 ++++--
 spacy/tests/pipeline/test_morphologizer.py    |  26 +-
 spacy/tests/pipeline/test_senter.py           |  25 ++
 spacy/tests/pipeline/test_spancat.py          |  34 +--
 spacy/tests/pipeline/test_tagger.py           |  24 +-
 spacy/tests/pipeline/test_textcat.py          |  64 +++--
 spacy/tokens/doc.pxd                          |   2 +
 spacy/tokens/doc.pyi                          |   3 +-
 spacy/tokens/doc.pyx                          |   1 +
 website/docs/api/doc.mdx                      |  33 +--
 website/docs/api/edittreelemmatizer.mdx       |  17 +-
 website/docs/api/entitylinker.mdx             |  29 +--
 website/docs/api/morphologizer.mdx            |  18 +-
 website/docs/api/sentencerecognizer.mdx       |  11 +-
 website/docs/api/spancategorizer.mdx          |  35 +--
 website/docs/api/tagger.mdx                   |  14 +-
 website/docs/api/textcategorizer.mdx          |  17 +-
 28 files changed, 669 insertions(+), 355 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 4a6174bc3d8..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,8 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -22,6 +22,9 @@
 TOP_K_GUARDRAIL = 20
 
 
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -50,6 +53,7 @@
         "overwrite": False,
         "top_k": 1,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"lemma_acc": 1.0},
 )
@@ -62,6 +66,7 @@ def make_edit_tree_lemmatizer(
     overwrite: bool,
     top_k: int,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
     """Construct an EditTreeLemmatizer component."""
     return EditTreeLemmatizer(
@@ -73,6 +78,7 @@ def make_edit_tree_lemmatizer(
         overwrite=overwrite,
         top_k=top_k,
         scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -92,6 +98,7 @@ def __init__(
         overwrite: bool = False,
         top_k: int = 1,
         scorer: Optional[Callable] = lemmatizer_score,
+        save_activations: bool = False,
     ):
         """
         Construct an edit tree lemmatizer.
@@ -103,6 +110,7 @@ def __init__(
             frequency in the training data.
         overwrite (bool): overwrite existing lemma annotations.
         top_k (int): try to apply at most the k most probable edit trees.
+        save_activations (bool): save model activations in Doc when annotating.
         """
         self.vocab = vocab
         self.model = model
@@ -117,7 +125,7 @@ def __init__(
 
         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
-        self.numpy_ops = NumpyOps()
+        self.save_activations = save_activations
 
     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
@@ -146,31 +154,24 @@ def get_loss(
 
         return float(loss), d_scores
 
-    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
-        if self.top_k == 1:
-            scores2guesses = self._scores2guesses_top_k_equals_1
-        elif self.top_k <= TOP_K_GUARDRAIL:
-            scores2guesses = self._scores2guesses_top_k_greater_1
-        else:
-            scores2guesses = self._scores2guesses_top_k_guardrail
-        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
-        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
-        # for its principal purpose of lemmatizing tokens. However, the code could also
-        # be used for other purposes, and with very large values of *top_k* the method
-        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
-        # instead.
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
+            guesses: List[Ints1d] = [
+                self.model.ops.alloc((0,), dtype="i") for doc in docs
+            ]
+            scores: List[Floats2d] = [
+                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
+            ]
             assert len(guesses) == n_docs
-            return guesses
+            return {"probabilities": scores, "tree_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
         guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
-        return guesses
+        return {"probabilities": scores, "tree_ids": guesses}
 
     def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
@@ -230,8 +231,13 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
 
         return guesses
 
-    def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
+        batch_tree_ids = activations["tree_ids"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tree_ids = batch_tree_ids[i]
             if hasattr(doc_tree_ids, "get"):
                 doc_tree_ids = doc_tree_ids.get()
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index a730ece1bfa..bab79282d5b 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,10 @@
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
+from typing import cast
+from numpy import dtype
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from pathlib import Path
+from itertools import islice
+import srsly
 import random
 from itertools import islice
 from pathlib import Path
@@ -21,6 +28,11 @@
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
+
+ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
+
+KNOWLEDGE_BASE_IDS = "kb_ids"
+
 # See #9050
 BACKWARD_OVERWRITE = True
 
@@ -60,6 +72,7 @@
         "use_gold_ents": True,
         "candidates_batch_size": 1,
         "threshold": None,
+        "save_activations": False,
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -87,6 +100,7 @@ def make_entity_linker(
     use_gold_ents: bool,
     candidates_batch_size: int,
     threshold: Optional[float] = None,
+    save_activations: bool,
 ):
     """Construct an EntityLinker component.
 
@@ -110,6 +124,7 @@ def make_entity_linker(
     candidates_batch_size (int): Size of batches for entity candidate generation.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.
+    save_activations (bool): save model activations in Doc when annotating.
     """
 
     if not model.attrs.get("include_span_maker", False):
@@ -144,6 +159,7 @@ def make_entity_linker(
         use_gold_ents=use_gold_ents,
         candidates_batch_size=candidates_batch_size,
         threshold=threshold,
+        save_activations=save_activations,
     )
 
 
@@ -185,6 +201,7 @@ def __init__(
         use_gold_ents: bool,
         candidates_batch_size: int,
         threshold: Optional[float] = None,
+        save_activations: bool = False,
     ) -> None:
         """Initialize an entity linker.
 
@@ -239,6 +256,7 @@ def __init__(
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold
+        self.save_activations = save_activations
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
@@ -427,7 +445,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         loss = loss / len(entity_encodings)
         return float(loss), out
 
-    def predict(self, docs: Iterable[Doc]) -> List[str]:
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
         Returns the KB IDs for each entity in each doc, including NIL if there is
         no prediction.
@@ -440,129 +458,138 @@ def predict(self, docs: Iterable[Doc]) -> List[str]:
         self.validate_kb()
         entity_count = 0
         final_kb_ids: List[str] = []
-        xp = self.model.ops.xp
+        ops = self.model.ops
+        xp = ops.xp
+        docs_ents: List[Ragged] = []
+        docs_scores: List[Ragged] = []
         if not docs:
-            return final_kb_ids
+            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
         if isinstance(docs, Doc):
             docs = [docs]
-        for i, doc in enumerate(docs):
+        for doc in docs:
+            doc_ents: List[Ints1d] = []
+            doc_scores: List[Floats1d] = []
             if len(doc) == 0:
+                docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
+                docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 continue
             sentences = [s for s in doc.sents]
 
-            # Loop over entities in batches.
-            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
-                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
-                # Look up candidate entities.
-                valid_ent_idx = [
-                    idx
-                    for idx in range(len(ent_batch))
-                    if ent_batch[idx].label_ not in self.labels_discard
-                ]
-
-                batch_candidates = list(
-                    self.get_candidates_batch(
-                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
-                    )
-                    if self.candidates_batch_size > 1
-                    else [
-                        self.get_candidates(self.kb, ent_batch[idx])
-                        for idx in valid_ent_idx
-                    ]
-                )
-
-                # Looping through each entity in batch (TODO: rewrite)
-                for j, ent in enumerate(ent_batch):
-                    assert hasattr(ent, "sents")
-                    sents = list(ent.sents)
-                    sent_indices = (
-                        sentences.index(sents[0]),
-                        sentences.index(sents[-1]),
+                if self.incl_context:
+                    # get n_neighbour sentences, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    sentence_encoding = self.model.predict([sent_doc])[0]
+                    sentence_encoding_t = sentence_encoding.T
+                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                entity_count += 1
+                if ent.label_ in self.labels_discard:
+                    # ignoring this entity - setting to NIL
+                    final_kb_ids.append(self.NIL)
+                    self._add_activations(
+                        doc_scores=doc_scores,
+                        doc_ents=doc_ents,
+                        scores=[0.0],
+                        ents=[0],
                     )
-                    assert sent_indices[1] >= sent_indices[0] >= 0
-
-                    if self.incl_context:
-                        # get n_neighbour sentences, clipped to the length of the document
-                        start_sentence = max(0, sent_indices[0] - self.n_sents)
-                        end_sentence = min(
-                            len(sentences) - 1, sent_indices[1] + self.n_sents
-                        )
-                        start_token = sentences[start_sentence].start
-                        end_token = sentences[end_sentence].end
-                        sent_doc = doc[start_token:end_token].as_doc()
-
-                        # currently, the context is the same for each entity in a sentence (should be refined)
-                        sentence_encoding = self.model.predict([sent_doc])[0]
-                        sentence_encoding_t = sentence_encoding.T
-                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                    entity_count += 1
-                    if ent.label_ in self.labels_discard:
-                        # ignoring this entity - setting to NIL
+                else:
+                    candidates = list(self.get_candidates(self.kb, ent))
+                    if not candidates:
+                        # no prediction possible for this entity - setting to NIL
                         final_kb_ids.append(self.NIL)
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=[0.0],
+                            ents=[0],
+                        )
+                    elif len(candidates) == 1 and self.threshold is None:
+                        # shortcut for efficiency reasons: take the 1 candidate
+                        final_kb_ids.append(candidates[0].entity_)
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=[1.0],
+                            ents=[candidates[0].entity_],
+                        )
                     else:
-                        candidates = list(batch_candidates[j])
-                        if not candidates:
-                            # no prediction possible for this entity - setting to NIL
-                            final_kb_ids.append(self.NIL)
-                        elif len(candidates) == 1 and self.threshold is None:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
+                        random.shuffle(candidates)
+                        # set all prior probabilities to 0 if incl_prior=False
+                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
+                        if not self.incl_prior:
+                            prior_probs = xp.asarray([0.0 for _ in candidates])
+                        scores = prior_probs
+                        # add in similarity from the context
+                        if self.incl_context:
+                            entity_encodings = xp.asarray(
+                                [c.entity_vector for c in candidates]
+                            )
+                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                            if len(entity_encodings) != len(prior_probs):
+                                raise RuntimeError(
+                                    Errors.E147.format(
+                                        method="predict",
+                                        msg="vectors not of equal length",
                                     )
-                                # cosine similarity
-                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                    sentence_norm * entity_norm
                                 )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            final_kb_ids.append(
-                                candidates[scores.argmax().item()].entity_
-                                if self.threshold is None
-                                or scores.max() >= self.threshold
-                                else EntityLinker.NIL
+                            # cosine similarity
+                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+                                sentence_norm * entity_norm
                             )
-
+                            if sims.shape != prior_probs.shape:
+                                raise ValueError(Errors.E161)
+                            scores = prior_probs + sims - (prior_probs * sims)
+                        final_kb_ids.append(
+                            candidates[scores.argmax().item()].entity_
+                            if self.threshold is None or scores.max() >= self.threshold
+                            else EntityLinker.NIL
+                        )
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=scores,
+                            ents=[c.entity for c in candidates],
+                        )
+            self._add_doc_activations(
+                docs_scores=docs_scores,
+                docs_ents=docs_ents,
+                doc_scores=doc_scores,
+                doc_ents=doc_ents,
+            )
         if not (len(final_kb_ids) == entity_count):
             err = Errors.E147.format(
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return final_kb_ids
+        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
 
-    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced
+                                 by EntityLinker.predict.
 
         DOCS: https://spacy.io/api/entitylinker#set_annotations
         """
+        kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
         count_ents = len([ent for doc in docs for ent in doc.ents])
         if count_ents != len(kb_ids):
             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
         i = 0
         overwrite = self.cfg["overwrite"]
-        for doc in docs:
+        for j, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    if act_name != KNOWLEDGE_BASE_IDS:
+                        # We only copy activations that are Ragged.
+                        doc.activations[self.name][act_name] = cast(Ragged, acts[j])
+
             for ent in doc.ents:
                 kb_id = kb_ids[i]
                 i += 1
@@ -661,3 +688,32 @@ def rehearse(self, examples, *, sgd=None, losses=None, **config):
 
     def add_label(self, label):
         raise NotImplementedError
+
+    def _add_doc_activations(
+        self,
+        *,
+        docs_scores: List[Ragged],
+        docs_ents: List[Ragged],
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
+        docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
+        docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
+
+    def _add_activations(
+        self,
+        *,
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+        scores: Sequence[float],
+        ents: Sequence[int],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        doc_scores.append(ops.asarray1f(scores))
+        doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index bdbe75fd824..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,4 +1,8 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import srsly
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
 
@@ -8,6 +12,12 @@ from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
+from ..language import Language
+from ..errors import Errors
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -50,8 +60,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
-                    "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -62,8 +77,10 @@ def make_morphologizer(
     extend: bool,
     label_smoothing: float,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+                         save_activations=save_activations)
 
 
 def morphologizer_score(examples, **kwargs):
@@ -99,6 +116,7 @@ class Morphologizer(Tagger):
         extend: bool = BACKWARD_EXTEND,
         label_smoothing: float = 0.0,
         scorer: Optional[Callable] = morphologizer_score,
+        save_activations: bool = False,
     ):
         """Initialize a morphologizer.
 
@@ -109,6 +127,7 @@ class Morphologizer(Tagger):
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/morphologizer#init
         """
@@ -129,6 +148,7 @@ class Morphologizer(Tagger):
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -222,14 +242,15 @@ class Morphologizer(Tagger):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
 
         DOCS: https://spacy.io/api/morphologizer#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
@@ -240,6 +261,10 @@ class Morphologizer(Tagger):
         # to allocate a compatible container out of the iterable.
         labels = tuple(self.labels)
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index df093baa9c6..521afe1d181 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,12 +1,16 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
 from typing import Callable, Optional
 
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
@@ -37,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "senter",
     assigns=["token.is_sent_start"],
-    default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
+    default_config={
+        "model": DEFAULT_SENTER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
-def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
-    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
+def make_senter(nlp: Language,
+                name: str,
+                model: Model,
+                overwrite: bool,
+                scorer: Optional[Callable],
+                save_activations: bool):
+    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
 
 
 def senter_score(examples, **kwargs):
@@ -71,6 +85,7 @@ class SentenceRecognizer(Tagger):
         *,
         overwrite=BACKWARD_OVERWRITE,
         scorer=senter_score,
+        save_activations: bool = False,
     ):
         """Initialize a sentence recognizer.
 
@@ -80,6 +95,7 @@ class SentenceRecognizer(Tagger):
             losses during training.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/sentencerecognizer#init
         """
@@ -89,6 +105,7 @@ class SentenceRecognizer(Tagger):
         self._rehearsal_model = None
         self.cfg = {"overwrite": overwrite}
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -106,19 +123,24 @@ class SentenceRecognizer(Tagger):
     def label_data(self):
         return None
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
 
         DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 08a5478a912..1450bb5d6cb 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
@@ -16,6 +18,9 @@
 from ..vocab import Vocab
 from .trainable_pipe import TrainablePipe
 
+ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+
+
 spancat_default_config = """
 [model]
 @architectures = "spacy.SpanCategorizer.v1"
@@ -170,6 +175,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
         "model": DEFAULT_SPANCAT_MODEL,
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -182,6 +188,7 @@ def make_spancat(
     scorer: Optional[Callable],
     threshold: float,
     max_positive: Optional[int],
+    save_activations: bool,
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component and configure it for multi-label
     classification to be able to assign multiple labels for each span.
@@ -209,6 +216,7 @@ def make_spancat(
         0.5.
     max_positive (Optional[int]): Maximum number of labels to consider positive
         per span. Defaults to None, indicating no limit.
+        save_activations (bool): save model activations in Doc when annotating.
     """
     return SpanCategorizer(
         nlp.vocab,
@@ -287,6 +295,7 @@ def make_spancat_singlelabel(
         add_negative_label=True,
         threshold=None,
         scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -349,6 +358,7 @@ def __init__(
         max_positive: Optional[int] = None,
         threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize the multi-label or multi-class span categorizer.
 
@@ -398,9 +408,7 @@ def __init__(
         self.model = model
         self.name = name
         self.scorer = scorer
-        self.add_negative_label = add_negative_label
-        if not allow_overlap and max_positive is not None and max_positive > 1:
-            raise ValueError(Errors.E1051.format(max_positive=max_positive))
+        self.save_activations = save_activations
 
     @property
     def key(self) -> str:
@@ -458,28 +466,7 @@ def label_data(self) -> List[str]:
         """
         return list(self.labels)
 
-    @property
-    def _label_map(self) -> Dict[str, int]:
-        """RETURNS (Dict[str, int]): The label map."""
-        return {label: i for i, label in enumerate(self.labels)}
-
-    @property
-    def _n_labels(self) -> int:
-        """RETURNS (int): Number of labels."""
-        if self.add_negative_label:
-            return len(self.labels) + 1
-        else:
-            return len(self.labels)
-
-    @property
-    def _negative_label_i(self) -> Union[int, None]:
-        """RETURNS (Union[int, None]): Index of the negative label."""
-        if self.add_negative_label:
-            return len(self.label_data)
-        else:
-            return None
-
-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -488,11 +475,8 @@ def predict(self, docs: Iterable[Doc]):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        if indices.lengths.sum() == 0:
-            scores = self.model.ops.alloc2f(0, 0)
-        else:
-            scores = self.model.predict((docs, indices))  # type: ignore
-        return indices, scores
+        scores = self.model.predict((docs, indices))  # type: ignore
+        return {"indices": indices, "scores": scores}
 
     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -512,32 +496,32 @@ def set_candidates(
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])
 
-    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        scores: The scores to set, produced by SpanCategorizer.predict.
+        activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
 
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
-        indices, scores = indices_scores
+        labels = self.labels
+
+        indices = activations["indices"]
+        assert isinstance(indices, Ragged)
+        scores = cast(Floats2d, activations["scores"])
+
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
-            allow_overlap = cast(bool, self.cfg["allow_overlap"])
-            if self.cfg["max_positive"] == 1:
-                doc.spans[self.key] = self._make_span_group_singlelabel(
-                    doc,
-                    indices_i,
-                    scores[offset : offset + indices.lengths[i]],
-                    allow_overlap,
-                )
-            else:
-                doc.spans[self.key] = self._make_span_group_multilabel(
-                    doc,
-                    indices_i,
-                    scores[offset : offset + indices.lengths[i]],
-                )
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["indices"] = indices_i
+                doc.activations[self.name]["scores"] = scores[
+                    offset : offset + indices.lengths[i]
+                ]
+            doc.spans[self.key] = self._make_span_group(
+                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
+            )
             offset += indices.lengths[i]
 
     def update(
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 34e85d49c2b..8ecd0c46ee0 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,10 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import numpy
+import srsly
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
+import warnings
 from itertools import islice
 from typing import Callable, Optional
 
@@ -15,6 +21,9 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .trainable_pipe import TrainablePipe
 
+
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
 # See #9050
 BACKWARD_OVERWRITE = False
 
@@ -38,7 +47,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "tagger",
     assigns=["token.tag"],
-    default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_TAGGER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
+        "neg_prefix": "!",
+        "save_activations": False,
+    },
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(
@@ -48,7 +63,7 @@ def make_tagger(
     overwrite: bool,
     scorer: Optional[Callable],
     neg_prefix: str,
-    label_smoothing: float,
+    save_activations: bool,
 ):
     """Construct a part-of-speech tagger component.
 
@@ -57,7 +72,8 @@ def make_tagger(
         in size, and be normalized as probabilities (all scores between 0 and 1,
         with the rows summing to 1).
     """
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
+    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
+                  save_activations=save_activations)
 
 
 def tagger_score(examples, **kwargs):
@@ -83,7 +99,7 @@ class Tagger(TrainablePipe):
         overwrite=BACKWARD_OVERWRITE,
         scorer=tagger_score,
         neg_prefix="!",
-        label_smoothing=0.0,
+        save_activations: bool = False,
     ):
         """Initialize a part-of-speech tagger.
 
@@ -93,6 +109,7 @@ class Tagger(TrainablePipe):
             losses during training.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/tagger#init
         """
@@ -103,6 +120,7 @@ class Tagger(TrainablePipe):
         cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -121,7 +139,7 @@ class Tagger(TrainablePipe):
         """Data about the labels currently added to the component."""
         return tuple(self.cfg["labels"])
 
-    def predict(self, docs):
+    def predict(self, docs) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -134,12 +152,12 @@ class Tagger(TrainablePipe):
             n_labels = len(self.labels)
             guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
             assert len(guesses) == len(docs)
-            return guesses
+            return {"probabilities": guesses, "label_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == len(docs), (len(scores), len(docs))
         guesses = self._scores2guesses(scores)
         assert len(guesses) == len(docs)
-        return guesses
+        return {"probabilities": scores, "label_ids": guesses}
 
     def _scores2guesses(self, scores):
         guesses = []
@@ -150,20 +168,25 @@ class Tagger(TrainablePipe):
             guesses.append(doc_guesses)
         return guesses
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by Tagger.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
 
         DOCS: https://spacy.io/api/tagger#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         labels = self.labels
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index ae227017a9f..6cb33109891 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
 
@@ -14,6 +18,9 @@
 from ..vocab import Vocab
 from .trainable_pipe import TrainablePipe
 
+ActivationsT = Dict[str, Floats2d]
+
+
 single_label_default_config = """
 [model]
 @architectures = "spacy.TextCatEnsemble.v2"
@@ -80,7 +87,8 @@
     default_config={
         "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
+        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -101,6 +109,7 @@ def make_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
+    save_activations: bool,
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -110,8 +119,16 @@ def make_textcat(
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
     scorer (Optional[Callable]): The scoring method.
+    save_activations (bool): save model activations in Doc when annotating.
     """
-    return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
+    return TextCategorizer(
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
+    )
 
 
 def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
@@ -142,6 +159,7 @@ def __init__(
         *,
         threshold: float,
         scorer: Optional[Callable] = textcat_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for single-label classification.
 
@@ -167,6 +185,7 @@ def __init__(
         }
         self.cfg = dict(cfg)
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
@@ -191,7 +210,7 @@ def label_data(self) -> List[str]:
         """
         return self.labels  # type: ignore[return-value]
 
-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -204,12 +223,12 @@ def predict(self, docs: Iterable[Doc]):
             tensors = [doc.tensor for doc in docs]
             xp = self.model.ops.xp
             scores = xp.zeros((len(list(docs)), len(self.labels)))
-            return scores
+            return {"probabilities": scores}
         scores = self.model.predict(docs)
         scores = self.model.ops.asarray(scores)
-        return scores
+        return {"probabilities": scores}
 
-    def set_annotations(self, docs: Iterable[Doc], scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
@@ -217,9 +236,13 @@ def set_annotations(self, docs: Iterable[Doc], scores) -> None:
 
         DOCS: https://spacy.io/api/textcategorizer#set_annotations
         """
+        probs = activations["probabilities"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["probabilities"] = probs[i]
             for j, label in enumerate(self.labels):
-                doc.cats[label] = float(scores[i, j])
+                doc.cats[label] = float(probs[i, j])
 
     def update(
         self,
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 2f8d5e60437..ac024ba3639 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional
 
@@ -78,7 +82,8 @@
     default_config={
         "threshold": 0.5,
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -99,8 +104,9 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "MultiLabel_TextCategorizer":
-    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
+    save_activations: bool,
+) -> "TextCategorizer":
+    """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -111,7 +117,12 @@ def make_multilabel_textcat(
     scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
-        nlp.vocab, model, name, threshold=threshold, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -143,6 +154,7 @@ def __init__(
         *,
         threshold: float,
         scorer: Optional[Callable] = textcat_multilabel_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for multi-label classification.
 
@@ -151,7 +163,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
-        scorer (Optional[Callable]): The scoring method.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init
         """
@@ -162,6 +174,7 @@ def __init__(
         cfg = {"labels": [], "threshold": threshold}
         self.cfg = dict(cfg)
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index b1d2550a1ce..3e9a0a9584d 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe):
     cdef public object model
     cdef public object cfg
     cdef public object scorer
+    cdef bint _save_activations
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 8f219b32797..bd360c9501b 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -2,10 +2,14 @@
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import Model, Optimizer, set_dropout_rate
+from thinc.api import set_dropout_rate, Model, Optimizer
+import warnings
 
 from ..tokens.doc cimport Doc
 
+from ..training import validate_examples
+from ..errors import Errors, Warnings
+from .pipe import Pipe, deserialize_config
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -342,3 +346,11 @@ cdef class TrainablePipe(Pipe):
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self
+
+    @property
+    def save_activations(self):
+        return self._save_activations
+
+    @save_activations.setter
+    def save_activations(self, save_activations: bool):
+        self._save_activations = save_activations
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 5a8f0aee2ab..ba2ed4e5ff3 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pickle
 
 import hypothesis.strategies as st
@@ -8,6 +9,8 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
+from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
@@ -331,3 +334,26 @@ def test_empty_strings():
     no_change = trees.add("xyz", "xyz")
     empty = trees.add("", "")
     assert no_change == empty
+
+
+def test_save_activations():
+    nlp = English()
+    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+    lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = lemmatizer.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "trainable_lemmatizer" not in doc.activations
+
+    lemmatizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+        "probabilities",
+        "tree_ids",
+    ]
+    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 00771a0f0f8..844bacb3b1f 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,7 +1,8 @@
-from typing import Any, Callable, Dict, Iterable, Tuple
+from typing import Callable, Iterable, Dict, Any, cast
 
 import pytest
 from numpy.testing import assert_equal
+from thinc.types import Ragged
 
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
@@ -9,8 +10,7 @@
 from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
 from spacy.lang.en import English
 from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker
-from spacy.pipeline import EntityLinker
+from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
@@ -1194,16 +1194,64 @@ def create_kb(vocab):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
 
 
-def test_span_maker_forward_with_empty():
-    """The forward pass of the span maker may have a doc with no entities."""
+def test_save_activations():
     nlp = English()
-    doc1 = nlp("a b c")
-    ent = doc1[0:1]
-    ent.label_ = "X"
-    doc1.ents = [ent]
-    # no entities
-    doc2 = nlp("x y z")
-
-    # just to get a model
-    span_maker = build_span_maker()
-    span_maker([doc1, doc2], False)
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True))
+    assert isinstance(entity_linker, EntityLinker)
+    entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+
+    # initialize the NEL pipe
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    nO = entity_linker.model.get_dim("nO")
+
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    doc = nlp("Russ Cochran was a publisher")
+    assert "entity_linker" not in doc.activations
+
+    entity_linker.save_activations = True
+    doc = nlp("Russ Cochran was a publisher")
+    assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"}
+    ents = doc.activations["entity_linker"]["ents"]
+    assert isinstance(ents, Ragged)
+    assert ents.data.shape == (2, 1)
+    assert ents.data.dtype == "uint64"
+    assert ents.lengths.shape == (1,)
+    scores = doc.activations["entity_linker"]["scores"]
+    assert isinstance(scores, Ragged)
+    assert scores.data.shape == (2, 1)
+    assert scores.data.dtype == "float32"
+    assert scores.lengths.shape == (1,)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 0d895f23688..c2b65977ac3 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -7,7 +8,8 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.morphology import Morphology
-from spacy.tests.util import make_tempdir
+from spacy.pipeline import TrainablePipe
+from spacy.attrs import MORPH
 from spacy.tokens import Doc
 from spacy.training import Example
 
@@ -224,3 +226,25 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_save_activations():
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "morphologizer" not in doc.activations
+
+    morphologizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {
+        "label_ids",
+        "probabilities",
+    }
+    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 6c76558123f..2e40d86ff48 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal
 
@@ -5,6 +6,7 @@
 from spacy.attrs import SENT_START
 from spacy.lang.en import English
 from spacy.language import Language
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir
 from spacy.training import Example
 
@@ -101,3 +103,26 @@ def test_overfitting_IO():
     # test internal pipe labels vs. Language.pipe_labels with hidden labels
     assert nlp.get_pipe("senter").labels == ("I", "S")
     assert "senter" not in nlp.pipe_labels
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = senter.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "senter" not in doc.activations
+
+    senter.save_activations = True
+    doc = nlp("This is a test.")
+    assert "senter" in doc.activations
+    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+    assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index c143d193fa6..9678e9b63b8 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -594,19 +594,21 @@ def test_set_candidates(name):
     assert docs[0].spans["candidates"][4].text == "Just a"
 
 
-@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
-@pytest.mark.parametrize("n_process", [1, 2])
-def test_spancat_multiprocessing(name, n_process):
-    if isinstance(get_current_ops, NumpyOps) or n_process < 2:
-        nlp = Language()
-        spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
-        train_examples = make_examples(nlp)
-        nlp.initialize(get_examples=lambda: train_examples)
-        texts = [
-            "Just a sentence.",
-            "I like London and Berlin",
-            "I like Berlin",
-            "I eat ham.",
-        ]
-        docs = list(nlp.pipe(texts, n_process=n_process))
-        assert len(docs) == len(texts)
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert "spancat" not in doc.activations
+
+    spancat.save_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 4b5f1ee99fc..5deb323dd71 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -6,7 +7,8 @@
 from spacy.attrs import TAG
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.training import Example
+from spacy.pipeline import TrainablePipe
+from thinc.api import compounding
 
 from ..util import make_tempdir
 
@@ -235,6 +237,26 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    tagger = cast(TrainablePipe, nlp.add_pipe("tagger"))
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "tagger" not in doc.activations
+
+    tagger.save_activations = True
+    doc = nlp("This is a test.")
+    assert "tagger" in doc.activations
+    assert set(doc.activations["tagger"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["tagger"]["probabilities"].shape == (5, len(TAGS))
+    assert doc.activations["tagger"]["label_ids"].shape == (5,)
+
+
 def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 8a0c1a9760d..710dac0571d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,3 +1,4 @@
+from typing import cast
 import random
 
 import numpy.random
@@ -11,17 +12,13 @@
 from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import TextCategorizer
-from spacy.pipeline.textcat import (
-    single_label_bow_config,
-    single_label_cnn_config,
-    single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
-    multi_label_bow_config,
-    multi_label_cnn_config,
-    multi_label_default_config,
-)
+from spacy.pipeline import TextCategorizer, TrainablePipe
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
@@ -298,7 +295,7 @@ def test_issue9904():
     nlp.initialize(get_examples)
 
     examples = get_examples()
-    scores = textcat.predict([eg.predicted for eg in examples])
+    scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
 
     loss = textcat.get_loss(examples, scores)[0]
     loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -949,24 +946,39 @@ def test_textcat_multi_threshold():
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 
 
-@pytest.mark.parametrize(
-    "component_name,scorer",
-    [
-        ("textcat", "spacy.textcat_scorer.v1"),
-        ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
-    ],
-)
-def test_textcat_legacy_scorers(component_name, scorer):
-    """Check that legacy scorers are registered and produce the expected score
-    keys."""
+def test_save_activations():
     nlp = English()
-    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
 
-    # score the model (it's not actually trained but that doesn't matter)
-    scores = nlp.evaluate(train_examples)
-    assert 0 <= scores["cats_score"] <= 1
+    doc = nlp("This is a test.")
+    assert "textcat" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat"].keys()) == ["probabilities"]
+    assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+
+
+def test_save_activations_multi():
+    nlp = English()
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_MULTI_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "textcat_multilabel" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
+    assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index d9719609cdc..5e8975ed337 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
 
     cdef public float sentiment
 
+    cdef public dict activations
+
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 0fae118b4b6..5fda6f2f789 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -16,7 +16,7 @@ from typing import (
 
 import numpy as np
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
 from .span import Span
 from .token import Token
 from .span_groups import SpanGroups
@@ -41,6 +41,7 @@ class Doc:
     max_length: int
     length: int
     sentiment: float
+    activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
     user_token_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 8db8c1d6f37..497656b6570 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -281,6 +281,7 @@ cdef class Doc:
         self.length = 0
         self.sentiment = 0.0
         self.cats = {}
+        self.activations = {}
         self.user_hooks = {}
         self.user_token_hooks = {}
         self.user_span_hooks = {}
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 0a582650076..310ce0dc88d 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {id="attributes"}
 
-| Name                 | Description                                                                                                                         |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `text`               | A string representation of the document text. ~~str~~                                                                               |
-| `text_with_ws`       | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                       |
-| `mem`                | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                            |
-| `vocab`              | The store of lexical types. ~~Vocab~~                                                                                               |
-| `tensor`             | Container for dense vector representations. ~~numpy.ndarray~~                                                                       |
-| `user_data`          | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                    |
-| `lang`               | Language of the document's vocabulary. ~~int~~                                                                                      |
-| `lang_`              | Language of the document's vocabulary. ~~str~~                                                                                      |
-| `sentiment`          | The document's positivity/negativity score, if available. ~~float~~                                                                 |
-| `user_hooks`         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                           |
-| `user_token_hooks`   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                   |
-| `user_span_hooks`    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                    |
-| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
-| `_`                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~       |
+| Name                                       | Description                                                                                                                                    |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`                                     | A string representation of the document text. ~~str~~                                                                                          |
+| `text_with_ws`                             | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                  |
+| `mem`                                      | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                       |
+| `vocab`                                    | The store of lexical types. ~~Vocab~~                                                                                                          |
+| `tensor` <Tag variant="new">2</Tag>        | Container for dense vector representations. ~~numpy.ndarray~~                                                                                  |
+| `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
+| `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
+| `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
+| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
+| `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
+| `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
+| `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
+| `has_unknown_spaces`                       | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~            |
+| `_`                                        | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                  |
+| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
 
 ## Serialization fields {id="serialization-fields"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 82967482c90..17af19e8c38 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -44,14 +44,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
 > ```
 
-| Setting         | Description                                                                                                                                                                                                                                                                                                        |
-| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `backoff`       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                      |
-| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                         |
-| `overwrite`     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                          |
-| `top_k`         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                              |
-| `scorer`        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                      |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                                        |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `backoff`                                       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                      |
+| `min_tree_freq`                                 | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                         |
+| `overwrite`                                     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                          |
+| `top_k`                                         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                              |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                      |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"tree_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                    |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 21d2e9015ce..85b872151fd 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
-| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
-| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
-| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
-| `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
-| `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
-| `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
-| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                                             |
-| `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
-| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` <Tag variant="new">3.5.1</Tag>  | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
-| `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
-| `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
-| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                      |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                 |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              |
+| `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           |
+| `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        |
+| `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      |
+| `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
+| `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
+| `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
+| `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index ce16f534219..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -42,13 +42,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("morphologizer", config=config)
 > ```
 
-| Setting                                        | Description                                                                                                                                                                                                                                                            |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                        | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>       | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
-| `extend` <Tag variant="new">3.2</Tag>          | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
-| `scorer` <Tag variant="new">3.2</Tag>          | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
-| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                               |
+| Setting                                         | Description                                                                                                                                                                                                                                                            |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
@@ -400,8 +400,8 @@ coarse-grained POS as the feature `POS`.
 > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
 > ```
 
-| Name        | Description                                            |
-| ----------- | ------------------------------------------------------ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
 | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 5435399f956..d5d096d7659 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -39,11 +39,12 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("senter", config=config)
 > ```
 
-| Setting                                  | Description                                                                                                                                                           |
-| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                             |
-| `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~                                   |
+| Setting                                         | Description                                                                                                                                                           |
+| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                             |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~                                   |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                      |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/senter.pyx
diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 98a1948eeab..258db794786 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -62,32 +62,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("spancat", config=config)
 > ```
 
-> #### Example (spancat_singlelabel)
->
-> ```python
-> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
-> config = {
->     "spans_key": "labeled_spans",
->     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
->     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
->     # Additional spancat_singlelabel parameters
->     "negative_weight": 0.8,
->     "allow_overlap": True,
-> }
-> nlp.add_pipe("spancat_singlelabel", config=config)
-> ```
-
-| Setting                                             | Description                                                                                                                                                                                                                                                                                             |
-| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `suggester`                                         | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                  |
-| `model`                                             | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
-| `spans_key`                                         | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                  |
-| `threshold`                                         | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~                                                |
-| `max_positive`                                      | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~                                                                                 |
-| `scorer`                                            | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                       |
-| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span` . This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~            |
-| `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               |
-| `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                             |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `suggester`                                     | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                  |
+| `model`                                         | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
+| `spans_key`                                     | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                  |
+| `threshold`                                     | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~                                                                                                                                                          |
+| `max_positive`                                  | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~                                                                                                                                                                                      |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                       |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                 |
 
 <Infobox variant="warning">
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index d9b0506fb17..20852e8eb94 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -40,13 +40,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("tagger", config=config)
 > ```
 
-| Setting                                        | Description                                                                                                                                                                                                                                                                                            |
-| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                        | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag>       | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                              |
-| `scorer` <Tag variant="new">3.2</Tag>          | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                            |
-| `neg_prefix` <Tag variant="new">3.2.1</Tag>    | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                              |
-| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                                                               |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                            |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                         | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                              |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                            |
+| `neg_prefix` <Tag variant="new">3.2.1</Tag>     | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                              |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                       |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/tagger.pyx
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a259b7b3c65..a1dfb6dd88e 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -116,14 +116,15 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name           | Description                                                                                                                      |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                                 |
-| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
-| _keyword-only_ |                                                                                                                                  |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~        |
-| `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| Name                                            | Description                                                                                                                      |
+| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                         | The shared vocabulary. ~~Vocab~~                                                                                                 |
+| `model`                                         | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
+| `name`                                          | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
+| _keyword-only_                                  |                                                                                                                                  |
+| `threshold`                                     | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                   |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. The supported activations is `"probabilities"`. ~~Union[bool, list[str]]~~            |
 
 ## TextCategorizer.\_\_call\_\_ {id="call",tag="method"}
 

From 93b953eec0b0bae80001dadd12198103d052df70 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 15 Sep 2022 17:06:58 +0200
Subject: [PATCH 129/504] disable mypy run for Python 3.10 (#11508) (#11512)

---
 .github/azure-steps.yml | 117 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 .github/azure-steps.yml

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
new file mode 100644
index 00000000000..c7722391fec
--- /dev/null
+++ b/.github/azure-steps.yml
@@ -0,0 +1,117 @@
+parameters:
+  python_version: ''
+  architecture: ''
+  prefix: ''
+  gpu: false
+  num_build_jobs: 1
+
+steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: ${{ parameters.python_version }}
+      architecture: ${{ parameters.architecture }}
+
+  - bash: |
+      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
+    displayName: 'Set variables'
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U pip setuptools
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install dependencies"
+
+  - script: |
+      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
+      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
+    displayName: "Compile and build sdist"
+
+  - script: python -m mypy spacy
+    displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')
+
+  - task: DeleteFiles@1
+    inputs:
+      contents: "spacy"
+    displayName: "Delete source directory"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
+      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+    displayName: "Uninstall all packages"
+
+  - bash: |
+      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+    displayName: "Install from sdist"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
+      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
+    displayName: "Install GPU requirements"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+    condition: eq(${{ parameters.gpu }}, false)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
+    displayName: "Run GPU tests"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l ca ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Test website/meta/universe.json'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

From cbda71f0767137f1ecb2a561e57475e2262fc18b Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Oct 2022 14:41:15 +0200
Subject: [PATCH 130/504] fix test for EL activations with refactored KB

---
 spacy/tests/pipeline/test_entity_linker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 844bacb3b1f..80b6e766347 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1209,7 +1209,7 @@ def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(

From 01b1c33afaa5d849a7edea75bb268c0b2e8d51f8 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:51:06 +0200
Subject: [PATCH 131/504] `StringStore` refactoring (#11344)

* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb6b08cfdcfbf26df6896f26709fac0d.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
---
 spacy/errors.py                               |   4 +-
 spacy/matcher/matcher.pyx                     |   3 +
 spacy/strings.pxd                             |  22 +-
 spacy/strings.pyi                             |  22 +-
 spacy/strings.pyx                             | 410 +++++++++---------
 spacy/tests/vocab_vectors/test_stringstore.py |  41 +-
 spacy/tokens/graph.pyx                        |   4 +-
 spacy/tokens/retokenizer.pyx                  |   4 +-
 website/docs/api/stringstore.mdx              |  82 +++-
 9 files changed, 334 insertions(+), 258 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 146c60b6d60..9814679eb7d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -258,7 +258,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
-    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
+    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
     E019 = ("Can't create transition with unknown action ID: {action}. Action "
@@ -991,6 +991,8 @@ class Errors(metaclass=ErrorsWithCodes):
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
+    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
+             "but got '{received_type}'")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 715dd45f07c..7e734ac247e 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -20,6 +20,9 @@ from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
 
+from ..schemas import validate_token_pattern
+from ..errors import Errors, MatchPatternError, Warnings
+from ..strings cimport get_string_id
 from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index d22f48ba133..b734a707c54 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,3 +1,6 @@
+from libc.stdint cimport int64_t, uint32_t
+from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
 from libcpp.set cimport set
@@ -7,13 +10,6 @@ from preshed.maps cimport PreshMap
 
 from .typedefs cimport attr_t, hash_t
 
-
-cpdef hash_t hash_string(str string) except 0
-cdef hash_t hash_utf8(char* utf8_string, int length) nogil
-
-cdef str decode_Utf8Str(const Utf8Str* string)
-
-
 ctypedef union Utf8Str:
     unsigned char[8] s
     unsigned char* p
@@ -21,9 +17,13 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
+    cdef vector[hash_t] _keys
+    cdef PreshMap _map
+
+    cdef hash_t _intern_str(self, str string)
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *
+    cdef str _decode_str_repr(self, const Utf8Str* string)
 
-    cdef vector[hash_t] keys
-    cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+cpdef hash_t hash_string(object string) except -1
+cpdef hash_t get_string_id(object string_or_hash) except -1
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index f8fe8381c87..8b7c0d6bd5a 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,21 +1,21 @@
+from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
-def get_string_id(key: Union[str, int]) -> int: ...
-
 class StringStore:
-    def __init__(
-        self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
-    ) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
     @overload
-    def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ...
+    def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
-    def __getitem__(self, string_or_id: int) -> str: ...
-    def as_int(self, key: Union[bytes, str, int]) -> int: ...
-    def as_string(self, key: Union[bytes, str, int]) -> str: ...
+    def __getitem__(self, string_or_hash: int) -> str: ...
+    def as_int(self, string_or_hash: Union[str, int]) -> int: ...
+    def as_string(self, string_or_hash: Union[str, int]) -> str: ...
     def add(self, string: str) -> int: ...
+    def items(self) -> List[Tuple[str, int]]: ...
+    def keys(self) -> List[str]: ...
+    def values(self) -> List[int]: ...
     def __len__(self) -> int: ...
-    def __contains__(self, string: str) -> bool: ...
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
     def __iter__(self) -> Iterator[str]: ...
     def __reduce__(self) -> Any: ...
     def to_disk(self, path: Union[str, Path]) -> None: ...
@@ -23,3 +23,5 @@ class StringStore:
     def to_bytes(self, **kwargs: Any) -> bytes: ...
     def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
     def _reset_and_load(self, strings: Iterable[str]) -> None: ...
+
+def get_string_id(string_or_hash: Union[str, int]) -> int: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index e73b66dff54..73e4c46ed46 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,9 +1,8 @@
 # cython: infer_types=True
-# cython: profile=False
+from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
 cimport cython
 from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-from murmurhash.mrmr cimport hash32, hash64
+from murmurhash.mrmr cimport hash64
 
 import srsly
 
@@ -15,105 +14,13 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
-cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
-    try:
-        out_hash[0] = key
-        return True
-    except:  # no-cython-lint
-        return False
-
-
-def get_string_id(key):
-    """Get a string ID, handling the reserved symbols correctly. If the key is
-    already an ID, return it.
-
-    This function optimises for convenience over performance, so shouldn't be
-    used in tight loops.
-    """
-    cdef hash_t str_hash    
-    if isinstance(key, str):
-        if len(key) == 0:
-            return 0
-
-        symbol = SYMBOLS_BY_STR.get(key, None)
-        if symbol is not None:
-            return symbol
-        else:
-            chars = key.encode("utf8")
-            return hash_utf8(chars, len(chars))
-    elif _try_coerce_to_hash(key, &str_hash):
-        # Coerce the integral key to the expected primitive hash type.
-        # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used 
-        # downsteam (as these are internally implemented as custom PyObjects 
-        # whose comparison operators can incur a significant overhead).
-        return str_hash
-    else:
-        # TODO: Raise an error instead
-        return key
-
-
-cpdef hash_t hash_string(str string) except 0:
-    chars = string.encode("utf8")
-    return hash_utf8(chars, len(chars))
-
-
-cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
-    return hash64(utf8_string, length, 1)
-
-
-cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
-    return hash32(utf8_string, length, 1)
-
-
-cdef str decode_Utf8Str(const Utf8Str* string):
-    cdef int i, length
-    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
-        return string.s[1:string.s[0]+1].decode("utf8")
-    elif string.p[0] < 255:
-        return string.p[1:string.p[0]+1].decode("utf8")
-    else:
-        i = 0
-        length = 0
-        while string.p[i] == 255:
-            i += 1
-            length += 255
-        length += string.p[i]
-        i += 1
-        return string.p[i:length + i].decode("utf8")
-
-
-cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
-    cdef int n_length_bytes
-    cdef int i
-    cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
-    if length < sizeof(string.s):
-        string.s[0] = <unsigned char>length
-        memcpy(&string.s[1], chars, length)
-        return string
-    elif length < 255:
-        string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
-        string.p[0] = length
-        memcpy(&string.p[1], chars, length)
-        return string
-    else:
-        i = 0
-        n_length_bytes = (length // 255) + 1
-        string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
-        for i in range(n_length_bytes-1):
-            string.p[i] = 255
-        string.p[n_length_bytes-1] = length % 255
-        memcpy(&string.p[n_length_bytes], chars, length)
-        return string
-
 
 cdef class StringStore:
-    """Look up strings by 64-bit hashes.
+    """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
     DOCS: https://spacy.io/api/stringstore
     """
-    def __init__(self, strings=None, freeze=False):
+    def __init__(self, strings: Optional[Iterable[str]] = None):
         """Create the StringStore.
 
         strings (iterable): A sequence of unicode strings to add to the store.
@@ -124,127 +31,126 @@ cdef class StringStore:
             for string in strings:
                 self.add(string)
 
-    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given hash, or vice versa.
+    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
+        """Retrieve a string from a given hash. If a string
+        is passed as the input, add it to the store and return
+        its hash.
 
-        string_or_id (bytes, str or uint64): The value to encode.
-        Returns (str / uint64): The value to be retrieved.
+        string_or_hash (int / str): The hash value to lookup or the string to store.
+        RETURNS (str / int): The stored string or the hash of the newly added string.
         """
-        cdef hash_t str_hash
-        cdef Utf8Str* utf8str = NULL
-
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return 0
-
-            # Return early if the string is found in the symbols LUT.
-            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
-            if symbol is not None:
-                return symbol
-            else:
-                return hash_string(string_or_id)
-        elif isinstance(string_or_id, bytes):
-            return hash_utf8(string_or_id, len(string_or_id))
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            if str_hash == 0:
-                return ""
-            elif str_hash in SYMBOLS_BY_INT:
-                return SYMBOLS_BY_INT[str_hash]
-            else:
-                utf8str = <Utf8Str*>self._map.get(str_hash)
+        if isinstance(string_or_hash, str):
+            return self.add(string_or_hash)
         else:
-            # TODO: Raise an error instead
-            utf8str = <Utf8Str*>self._map.get(string_or_id)
+            return self._get_interned_str(string_or_hash)
 
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool:
+        """Check whether a string or a hash is in the store.
 
-    def as_int(self, key):
-        """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, str):
-            return key
+        string (str / int): The string/hash to check.
+        RETURNS (bool): Whether the store contains the string.
+        """
+        cdef hash_t str_hash = get_string_id(string_or_hash)
+        if str_hash in SYMBOLS_BY_INT:
+            return True
         else:
-            return self[key]
+            return self._map.get(str_hash) is not NULL
 
-    def as_string(self, key):
-        """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, str):
-            return key
-        else:
-            return self[key]
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the strings in the store in insertion order.
+
+        RETURNS: An iterable collection of strings.
+        """
+        return iter(self.keys())
+
+    def __reduce__(self):
+        strings = list(self)
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self._keys.size()
 
-    def add(self, string):
+    def add(self, string: str) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
         RETURNS (uint64): The string's hash value.
         """
-        cdef hash_t str_hash
-        if isinstance(string, str):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-
-            string = string.encode("utf8")
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        elif isinstance(string, bytes):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        else:
+        if not isinstance(string, str):
             raise TypeError(Errors.E017.format(value_type=type(string)))
-        return str_hash
 
-    def __len__(self):
-        """The number of strings in the store.
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string)
 
-        RETURNS (int): The number of strings in the store.
+    def as_int(self, string_or_hash: Union[str, int]) -> str:
+        """If a hash value is passed as the input, return it as-is. If the input
+        is a string, return its corresponding hash.
+
+        string_or_hash (str / int): The string to hash or a hash value.
+        RETURNS (int): The hash of the string or the input hash value.
         """
-        return self.keys.size()
+        if isinstance(string_or_hash, int):
+            return string_or_hash
+        else:
+            return get_string_id(string_or_hash)
 
-    def __contains__(self, string_or_id not None):
-        """Check whether a string or ID is in the store.
+    def as_string(self, string_or_hash: Union[str, int]) -> str:
+        """If a string is passed as the input, return it as-is. If the input
+        is a hash value, return its corresponding string.
 
-        string_or_id (str or int): The string to check.
-        RETURNS (bool): Whether the store contains the string.
+        string_or_hash (str / int): The hash value to lookup or a string.
+        RETURNS (str): The stored string or the input string.
         """
-        cdef hash_t str_hash
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return True
-            elif string_or_id in SYMBOLS_BY_STR:
-                return True
-            str_hash = hash_string(string_or_id)
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            pass
+        if isinstance(string_or_hash, str):
+            return string_or_hash
         else:
-            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            return self._get_interned_str(string_or_hash)
 
-        if str_hash in SYMBOLS_BY_INT:
-            return True
-        else:
-            return self._map.get(str_hash) is not NULL
+    def items(self) -> List[Tuple[str, int]]:
+        """Iterate over the stored strings and their hashes in insertion order.
 
-    def __iter__(self):
-        """Iterate over the strings in the store, in order.
+        RETURNS: A list of string-hash pairs.
+        """
+        # Even though we internally store the hashes as keys and the strings as
+        # values, we invert the order in the public API to keep it consistent with
+        # the implementation of the `__iter__` method (where we wish to iterate over
+        # the strings in the store).
+        cdef int i
+        pairs = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            str_hash = self._keys[i]
+            utf8str = <Utf8Str*>self._map.get(str_hash)
+            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
+        return pairs
+
+    def keys(self) -> List[str]:
+        """Iterate over the stored strings in insertion order.
 
-        YIELDS (str): A string in the store.
+        RETURNS: A list of strings.
         """
         cdef int i
-        cdef hash_t key
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            utf8str = <Utf8Str*>self._map.get(key)
-            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        strings = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            utf8str = <Utf8Str*>self._map.get(self._keys[i])
+            strings[i] = self._decode_str_repr(utf8str)
+        return strings
 
-    def __reduce__(self):
-        strings = list(self)
-        return (StringStore, (strings,), None, None, None)
+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashs.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        return hashes
 
     def to_disk(self, path):
         """Save the current state to a directory.
@@ -295,24 +201,122 @@ cdef class StringStore:
     def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self.keys.clear()
+        self._keys.clear()
         for string in strings:
             self.add(string)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+    def _get_interned_str(self, hash_value: int) -> str:
+        cdef hash_t str_hash
+        if not _try_coerce_to_hash(hash_value, &str_hash):
+            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))
+
+        # Handle reserved symbols and empty strings correctly.
+        if str_hash == 0:
+            return ""
 
-    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+        symbol = SYMBOLS_BY_INT.get(str_hash)
+        if symbol is not None:
+            return symbol
+
+        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL:
+            raise KeyError(Errors.E018.format(hash_value=str_hash))
+        else:
+            return self._decode_str_repr(utf8str)
+
+    cdef hash_t _intern_str(self, str string):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
+        chars = string.encode('utf-8')
+        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
-            return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+            return key
+
+        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
         self._map.set(key, value)
-        self.keys.push_back(key)
-        return value
+        self._keys.push_back(key)
+        return key
+
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
+        cdef int n_length_bytes
+        cdef int i
+        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
+        cdef uint32_t ulength = length
+        if length < sizeof(string.s):
+            string.s[0] = <unsigned char>length
+            memcpy(&string.s[1], chars, length)
+            return string
+        elif length < 255:
+            string.p = <unsigned char*>self.mem.alloc(length + 1, sizeof(unsigned char))
+            string.p[0] = length
+            memcpy(&string.p[1], chars, length)
+            return string
+        else:
+            i = 0
+            n_length_bytes = (length // 255) + 1
+            string.p = <unsigned char*>self.mem.alloc(length + n_length_bytes, sizeof(unsigned char))
+            for i in range(n_length_bytes-1):
+                string.p[i] = 255
+            string.p[n_length_bytes-1] = length % 255
+            memcpy(&string.p[n_length_bytes], chars, length)
+            return string
+
+    cdef str _decode_str_repr(self, const Utf8Str* string):
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1].decode('utf-8')
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1].decode('utf-8')
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i].decode('utf-8')
+
+
+cpdef hash_t hash_string(object string) except -1:
+    if not isinstance(string, str):
+        raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string)))
+
+    # Handle reserved symbols and empty strings correctly.
+    if len(string) == 0:
+        return 0
+
+    symbol = SYMBOLS_BY_STR.get(string)
+    if symbol is not None:
+        return symbol
+
+    chars = string.encode('utf-8')
+    return hash64(<unsigned char*>chars, len(chars), 1)
+
+
+cpdef hash_t get_string_id(object string_or_hash) except -1:
+    cdef hash_t str_hash
+
+    try:
+        return hash_string(string_or_hash)
+    except:
+        if _try_coerce_to_hash(string_or_hash, &str_hash):
+            # Coerce the integral key to the expected primitive hash type.
+            # This ensures that custom/overloaded "primitive" data types
+            # such as those implemented by numpy are not inadvertently used
+            # downsteam (as these are internally implemented as custom PyObjects
+            # whose comparison operators can incur a significant overhead).
+            return str_hash
+        else:
+            raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash)))
+
+
+# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
+cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
+    try:
+        out_hash[0] = key
+        return True
+    except:
+        return False
+
diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py
index 61039fffd4c..68c307939d3 100644
--- a/spacy/tests/vocab_vectors/test_stringstore.py
+++ b/spacy/tests/vocab_vectors/test_stringstore.py
@@ -25,6 +25,14 @@ def test_stringstore_from_api_docs(stringstore):
     stringstore.add("orange")
     all_strings = [s for s in stringstore]
     assert all_strings == ["apple", "orange"]
+    assert all_strings == list(stringstore.keys())
+    all_strings_and_hashes = list(stringstore.items())
+    assert all_strings_and_hashes == [
+        ("apple", 8566208034543834098),
+        ("orange", 2208928596161743350),
+    ]
+    all_hashes = list(stringstore.values())
+    assert all_hashes == [8566208034543834098, 2208928596161743350]
     banana_hash = stringstore.add("banana")
     assert len(stringstore) == 3
     assert banana_hash == 2525716904149915114
@@ -32,12 +40,25 @@ def test_stringstore_from_api_docs(stringstore):
     assert stringstore["banana"] == banana_hash
 
 
-@pytest.mark.parametrize("text1,text2,text3", [(b"Hello", b"goodbye", b"hello")])
-def test_stringstore_save_bytes(stringstore, text1, text2, text3):
-    key = stringstore.add(text1)
-    assert stringstore[text1] == key
-    assert stringstore[text2] != key
-    assert stringstore[text3] != key
+@pytest.mark.parametrize(
+    "val_bytes,val_float,val_list,val_text,val_hash",
+    [(b"Hello", 1.1, ["abc"], "apple", 8566208034543834098)],
+)
+def test_stringstore_type_checking(
+    stringstore, val_bytes, val_float, val_list, val_text, val_hash
+):
+    with pytest.raises(TypeError):
+        assert stringstore[val_bytes]
+
+    with pytest.raises(TypeError):
+        stringstore.add(val_float)
+
+    with pytest.raises(TypeError):
+        assert val_list not in stringstore
+
+    key = stringstore.add(val_text)
+    assert val_hash == key
+    assert stringstore[val_hash] == val_text
 
 
 @pytest.mark.parametrize("text1,text2,text3", [("Hello", "goodbye", "hello")])
@@ -48,19 +69,19 @@ def test_stringstore_save_unicode(stringstore, text1, text2, text3):
     assert stringstore[text3] != key
 
 
-@pytest.mark.parametrize("text", [b"A"])
+@pytest.mark.parametrize("text", ["A"])
 def test_stringstore_retrieve_id(stringstore, text):
     key = stringstore.add(text)
     assert len(stringstore) == 1
-    assert stringstore[key] == text.decode("utf8")
+    assert stringstore[key] == text
     with pytest.raises(KeyError):
         stringstore[20000]
 
 
-@pytest.mark.parametrize("text1,text2", [(b"0123456789", b"A")])
+@pytest.mark.parametrize("text1,text2", [("0123456789", "A")])
 def test_stringstore_med_string(stringstore, text1, text2):
     store = stringstore.add(text1)
-    assert stringstore[store] == text1.decode("utf8")
+    assert stringstore[store] == text1
     stringstore.add(text2)
     assert stringstore[text1] == store
 
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 6c4ce6ce358..22ce18181a7 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -16,9 +16,7 @@ from murmurhash.mrmr cimport hash64
 from .. import Errors
 
 from ..typedefs cimport hash_t
-
-from ..strings import get_string_id
-
+from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
 
 from .token import Token
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index b0e4ff85c9f..d3e9c5674cc 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -15,9 +15,7 @@ from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
-from ..strings import get_string_id
-from ..util import SimpleFrozenDict
-from .underscore import is_writable_attr
+from ..strings cimport get_string_id
 
 
 cdef class Retokenizer:
diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 6a3e9d6644e..d4d85e6d56a 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -47,7 +47,8 @@ Get the number of strings in the store.
 
 ## StringStore.\_\_getitem\_\_ {id="getitem",tag="method"}
 
-Retrieve a string from a given hash, or vice versa.
+Retrieve a string from a given hash. If a string is passed as the input, add it
+to the store and return its hash.
 
 > #### Example
 >
@@ -58,14 +59,14 @@ Retrieve a string from a given hash, or vice versa.
 > assert stringstore[apple_hash] == "apple"
 > ```
 
-| Name           | Description                                     |
-| -------------- | ----------------------------------------------- |
-| `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ |
-| **RETURNS**    | The value to be retrieved. ~~Union[str, int]~~  |
+| Name             | Description                                                                  |
+| ---------------- | ---------------------------------------------------------------------------- |
+| `string_or_hash` | The hash value to lookup or the string to store. ~~Union[str, int]~~         |
+| **RETURNS**      | The stored string or the hash of the newly added string. ~~Union[str, int]~~ |
 
 ## StringStore.\_\_contains\_\_ {id="contains",tag="method"}
 
-Check whether a string is in the store.
+Check whether a string or a hash is in the store.
 
 > #### Example
 >
@@ -75,15 +76,14 @@ Check whether a string is in the store.
 > assert not "cherry" in stringstore
 > ```
 
-| Name        | Description                                     |
-| ----------- | ----------------------------------------------- |
-| `string`    | The string to check. ~~str~~                    |
-| **RETURNS** | Whether the store contains the string. ~~bool~~ |
+| Name             | Description                                             |
+| ---------------- | ------------------------------------------------------- |
+| `string_or_hash` | The string or hash to check. ~~Union[str, int]~~        |
+| **RETURNS**      | Whether the store contains the string or hash. ~~bool~~ |
 
 ## StringStore.\_\_iter\_\_ {id="iter",tag="method"}
 
-Iterate over the strings in the store, in order. Note that a newly initialized
-store will always include an empty string `""` at position `0`.
+Iterate over the stored strings in insertion order.
 
 > #### Example
 >
@@ -93,11 +93,59 @@ store will always include an empty string `""` at position `0`.
 > assert all_strings == ["apple", "orange"]
 > ```
 
-| Name       | Description                    |
-| ---------- | ------------------------------ |
-| **YIELDS** | A string in the store. ~~str~~ |
+| Name        | Description                    |
+| ----------- | ------------------------------ |
+| **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.add {id="add",tag="method",version="2"}
+## StringStore.items {#iter tag="method" new="4"}
+
+Iterate over the stored string-hash pairs in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_strings_and_hashes = stringstore.items()
+> assert all_strings_and_hashes == [("apple", 8566208034543834098), ("orange", 2208928596161743350)]
+> ```
+
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
+| **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
+
+## StringStore.keys {#iter tag="method" new="4"}
+
+Iterate over the stored strings in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_strings = stringstore.keys()
+> assert all_strings == ["apple", "orange"]
+> ```
+
+| Name        | Description                      |
+| ----------- | -------------------------------- |
+| **RETURNS** | A list of strings. ~~List[str]~~ |
+
+## StringStore.values {#iter tag="method" new="4"}
+
+Iterate over the stored string hashes in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_hashes = stringstore.values()
+> assert all_hashes == [8566208034543834098, 2208928596161743350]
+> ```
+
+| Name        | Description                            |
+| ----------- | -------------------------------------- |
+| **RETURNS** | A list of string hashes. ~~List[int]~~ |
+
+## StringStore.add {#add tag="method"}
 
 Add a string to the `StringStore`.
 
@@ -117,7 +165,7 @@ Add a string to the `StringStore`.
 | `string`    | The string to add. ~~str~~       |
 | **RETURNS** | The string's hash value. ~~int~~ |
 
-## StringStore.to_disk {id="to_disk",tag="method",version="2"}
+## StringStore.to_disk {#to_disk tag="method"}
 
 Save the current state to a directory.
 

From 33ac24fe05a8e7e9bdb51d686b3386dea5f05c3d Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 21 Oct 2022 18:01:18 +0900
Subject: [PATCH 132/504] Remove thinc util reimports (#11665)

* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
---
 spacy/tests/pipeline/test_spancat.py | 7 +++----
 spacy/util.py                        | 8 +++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 9678e9b63b8..5dcc2e70f67 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,6 @@
 import numpy
-import pytest
-from numpy.testing import assert_almost_equal, assert_array_equal
-from thinc.api import NumpyOps, Ragged, get_current_ops
+from numpy.testing import assert_array_equal, assert_almost_equal
+from thinc.api import get_current_ops, Ragged, fix_random_seed
 
 from spacy import util
 from spacy.lang.en import English
@@ -9,7 +8,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import fix_random_seed, make_tempdir, registry
+from spacy.util import registry, make_tempdir
 
 OPS = get_current_ops()
 
diff --git a/spacy/util.py b/spacy/util.py
index c127be03c37..8068c4bcec9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -63,12 +63,10 @@
 except ImportError:
     cupy = None
 
-# These are functions that were previously (v2.x) available from spacy.util
-# and have since moved to Thinc. We're importing them here so people's code
-# doesn't break, but they should always be imported from Thinc from now on,
-# not from spacy.util.
-from thinc.api import compounding, decaying, fix_random_seed  # noqa: F401
 
+from .symbols import ORTH
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
 from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings

From 06efa741862de708ca80a11f67fc854af52c3dff Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 133/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/errors.py                               |   8 +-
 spacy/pipeline/__init__.py                    |   2 -
 spacy/pipeline/entity_ruler.py                | 541 ------------------
 spacy/pipeline/span_ruler.py                  |  23 +-
 spacy/tests/matcher/test_phrase_matcher.py    |   9 +-
 spacy/tests/pipeline/test_entity_ruler.py     | 259 +++------
 .../serialize/test_serialize_pipeline.py      |  67 +--
 website/docs/api/entityruler.mdx              | 311 ++--------
 website/docs/api/spanruler.mdx                |  13 +-
 website/docs/usage/101/_architecture.mdx      |  40 +-
 website/docs/usage/101/_pipelines.mdx         |   6 +-
 website/docs/usage/processing-pipelines.mdx   |   5 +-
 website/docs/usage/rule-based-matching.mdx    |  43 +-
 website/docs/usage/saving-loading.mdx         |  10 +-
 website/docs/usage/training.mdx               |   2 +-
 15 files changed, 245 insertions(+), 1094 deletions(-)
 delete mode 100644 spacy/pipeline/entity_ruler.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 9814679eb7d..965c92066bc 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -470,13 +470,13 @@ class Errors(metaclass=ErrorsWithCodes):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E153 = ("The value type {vtype} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E154 = ("One of the attributes or values is not supported for token "
             "patterns. Please use the option `validate=True` with the Matcher, "
-            "PhraseMatcher, or EntityRuler for more details.")
+            "PhraseMatcher, or SpanRuler for more details.")
     E155 = ("The pipeline needs to include a {pipe} in order to use "
             "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
@@ -933,8 +933,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
-             "exist.")
     E1024 = ("A pattern with {attr_type} '{label}' is not present in "
              "'{component}' patterns.")
     E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 82d24486a27..e26f7436efa 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,7 +3,6 @@
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
-from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
@@ -25,7 +24,6 @@
     "EditTreeLemmatizer",
     "EntityLinker",
     "EntityRecognizer",
-    "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
     "MultiLabel_TextCategorizer",
diff --git a/spacy/pipeline/entity_ruler.py b/spacy/pipeline/entity_ruler.py
deleted file mode 100644
index 3683cfc0270..00000000000
--- a/spacy/pipeline/entity_ruler.py
+++ /dev/null
@@ -1,541 +0,0 @@
-import warnings
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
-
-import srsly
-
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..matcher import Matcher, PhraseMatcher
-from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import get_ner_prf
-from ..tokens import Doc, Span
-from ..training import Example
-from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
-from .pipe import Pipe
-
-DEFAULT_ENT_ID_SEP = "||"
-PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
-
-
-@Language.factory(
-    "entity_ruler",
-    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
-    default_config={
-        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-        "validate": False,
-        "overwrite_ents": False,
-        "ent_id_sep": DEFAULT_ENT_ID_SEP,
-        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
-    },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
-)
-def make_entity_ruler(
-    nlp: Language,
-    name: str,
-    phrase_matcher_attr: Optional[Union[int, str]],
-    matcher_fuzzy_compare: Callable,
-    validate: bool,
-    overwrite_ents: bool,
-    ent_id_sep: str,
-    scorer: Optional[Callable],
-):
-    return EntityRuler(
-        nlp,
-        name,
-        phrase_matcher_attr=phrase_matcher_attr,
-        matcher_fuzzy_compare=matcher_fuzzy_compare,
-        validate=validate,
-        overwrite_ents=overwrite_ents,
-        ent_id_sep=ent_id_sep,
-        scorer=scorer,
-    )
-
-
-def entity_ruler_score(examples, **kwargs):
-    return get_ner_prf(examples)
-
-
-@registry.scorers("spacy.entity_ruler_scorer.v1")
-def make_entity_ruler_scorer():
-    return entity_ruler_score
-
-
-class EntityRuler(Pipe):
-    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
-    rules or exact phrase matches. It can be combined with the statistical
-    `EntityRecognizer` to boost accuracy, or used on its own to implement a
-    purely rule-based entity recognition system. After initialization, the
-    component is typically added to the pipeline using `nlp.add_pipe`.
-
-    DOCS: https://spacy.io/api/entityruler
-    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
-    """
-
-    def __init__(
-        self,
-        nlp: Language,
-        name: str = "entity_ruler",
-        *,
-        phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = levenshtein_compare,
-        validate: bool = False,
-        overwrite_ents: bool = False,
-        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
-        patterns: Optional[List[PatternType]] = None,
-        scorer: Optional[Callable] = entity_ruler_score,
-    ) -> None:
-        """Initialize the entity ruler. If patterns are supplied here, they
-        need to be a list of dictionaries with a `"label"` and `"pattern"`
-        key. A pattern can either be a token pattern (list) or a phrase pattern
-        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
-
-        nlp (Language): The shared nlp object to pass the vocab to the matchers
-            and process phrase patterns.
-        name (str): Instance name of the current pipeline component. Typically
-            passed in automatically from the factory when the component is
-            added. Used to disable the current entity ruler while creating
-            phrase patterns with the nlp object.
-        phrase_matcher_attr (int / str): Token attribute to match on, passed
-            to the internal PhraseMatcher as `attr`.
-        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
-            internal Matcher. Defaults to
-            spacy.matcher.levenshtein.levenshtein_compare.
-        validate (bool): Whether patterns should be validated, passed to
-            Matcher and PhraseMatcher as `validate`
-        patterns (iterable): Optional patterns to load in.
-        overwrite_ents (bool): If existing entities are present, e.g. entities
-            added by the model, overwrite them by matches if necessary.
-        ent_id_sep (str): Separator used internally for entity IDs.
-        scorer (Optional[Callable]): The scoring method. Defaults to
-            spacy.scorer.get_ner_prf.
-
-        DOCS: https://spacy.io/api/entityruler#init
-        """
-        self.nlp = nlp
-        self.name = name
-        self.overwrite = overwrite_ents
-        self.token_patterns = defaultdict(list)  # type: ignore
-        self.phrase_patterns = defaultdict(list)  # type: ignore
-        self._validate = validate
-        self.matcher_fuzzy_compare = matcher_fuzzy_compare
-        self.matcher = Matcher(
-            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
-        )
-        self.phrase_matcher_attr = phrase_matcher_attr
-        self.phrase_matcher = PhraseMatcher(
-            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-        )
-        self.ent_id_sep = ent_id_sep
-        self._ent_ids = defaultdict(tuple)  # type: ignore
-        if patterns is not None:
-            self.add_patterns(patterns)
-        self.scorer = scorer
-
-    def __len__(self) -> int:
-        """The number of all patterns added to the entity ruler."""
-        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
-        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
-        return n_token_patterns + n_phrase_patterns
-
-    def __contains__(self, label: str) -> bool:
-        """Whether a label is present in the patterns."""
-        return label in self.token_patterns or label in self.phrase_patterns
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Find matches in document and add them as entities.
-
-        doc (Doc): The Doc object in the pipeline.
-        RETURNS (Doc): The Doc with added entities, if available.
-
-        DOCS: https://spacy.io/api/entityruler#call
-        """
-        error_handler = self.get_error_handler()
-        try:
-            matches = self.match(doc)
-            self.set_annotations(doc, matches)
-            return doc
-        except Exception as e:
-            return error_handler(self.name, self, [doc], e)
-
-    def match(self, doc: Doc):
-        self._require_patterns()
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", message="\\[W036")
-            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
-
-        final_matches = set(
-            [(m_id, start, end) for m_id, start, end in matches if start != end]
-        )
-        get_sort_key = lambda m: (m[2] - m[1], -m[1])
-        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
-        return final_matches
-
-    def set_annotations(self, doc, matches):
-        """Modify the document in place"""
-        entities = list(doc.ents)
-        new_entities = []
-        seen_tokens = set()
-        for match_id, start, end in matches:
-            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
-                continue
-            # check for end - 1 here because boundaries are inclusive
-            if start not in seen_tokens and end - 1 not in seen_tokens:
-                if match_id in self._ent_ids:
-                    label, ent_id = self._ent_ids[match_id]
-                    span = Span(doc, start, end, label=label, span_id=ent_id)
-                else:
-                    span = Span(doc, start, end, label=match_id)
-                new_entities.append(span)
-                entities = [
-                    e for e in entities if not (e.start < end and e.end > start)
-                ]
-                seen_tokens.update(range(start, end))
-        doc.ents = entities + new_entities
-
-    @property
-    def labels(self) -> Tuple[str, ...]:
-        """All labels present in the match patterns.
-
-        RETURNS (set): The string labels.
-
-        DOCS: https://spacy.io/api/entityruler#labels
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_labels = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                label, _ = self._split_label(l)
-                all_labels.add(label)
-            else:
-                all_labels.add(l)
-        return tuple(sorted(all_labels))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        patterns: Optional[Sequence[PatternType]] = None,
-    ):
-        """Initialize the pipe for training.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        patterns Optional[Iterable[PatternType]]: The list of patterns.
-
-        DOCS: https://spacy.io/api/entityruler#initialize
-        """
-        self.clear()
-        if patterns:
-            self.add_patterns(patterns)  # type: ignore[arg-type]
-
-    @property
-    def ent_ids(self) -> Tuple[Optional[str], ...]:
-        """All entity ids present in the match patterns `id` properties
-
-        RETURNS (set): The string entity ids.
-
-        DOCS: https://spacy.io/api/entityruler#ent_ids
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_ent_ids = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                _, ent_id = self._split_label(l)
-                all_ent_ids.add(ent_id)
-        return tuple(all_ent_ids)
-
-    @property
-    def patterns(self) -> List[PatternType]:
-        """Get all patterns that were added to the entity ruler.
-
-        RETURNS (list): The original patterns, one dictionary per pattern.
-
-        DOCS: https://spacy.io/api/entityruler#patterns
-        """
-        all_patterns = []
-        for label, patterns in self.token_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        for label, patterns in self.phrase_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern.text}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        return all_patterns
-
-    def add_patterns(self, patterns: List[PatternType]) -> None:
-        """Add patterns to the entity ruler. A pattern can either be a token
-        pattern (list of dicts) or a phrase pattern (string). For example:
-        {'label': 'ORG', 'pattern': 'Apple'}
-        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
-
-        patterns (list): The patterns to add.
-
-        DOCS: https://spacy.io/api/entityruler#add_patterns
-        """
-
-        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
-        try:
-            current_index = -1
-            for i, (name, pipe) in enumerate(self.nlp.pipeline):
-                if self == pipe:
-                    current_index = i
-                    break
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
-        except ValueError:
-            subsequent_pipes = []
-        with self.nlp.select_pipes(disable=subsequent_pipes):
-            token_patterns = []
-            phrase_pattern_labels = []
-            phrase_pattern_texts = []
-            phrase_pattern_ids = []
-            for entry in patterns:
-                if isinstance(entry["pattern"], str):
-                    phrase_pattern_labels.append(entry["label"])
-                    phrase_pattern_texts.append(entry["pattern"])
-                    phrase_pattern_ids.append(entry.get("id"))
-                elif isinstance(entry["pattern"], list):
-                    token_patterns.append(entry)
-            phrase_patterns = []
-            for label, pattern, ent_id in zip(
-                phrase_pattern_labels,
-                self.nlp.pipe(phrase_pattern_texts),
-                phrase_pattern_ids,
-            ):
-                phrase_pattern = {"label": label, "pattern": pattern}
-                if ent_id:
-                    phrase_pattern["id"] = ent_id
-                phrase_patterns.append(phrase_pattern)
-            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
-                label = entry["label"]  # type: ignore
-                if "id" in entry:
-                    ent_label = label
-                    label = self._create_label(label, entry["id"])
-                    key = self.matcher._normalize_key(label)
-                    self._ent_ids[key] = (ent_label, entry["id"])
-                pattern = entry["pattern"]  # type: ignore
-                if isinstance(pattern, Doc):
-                    self.phrase_patterns[label].append(pattern)
-                    self.phrase_matcher.add(label, [pattern])  # type: ignore
-                elif isinstance(pattern, list):
-                    self.token_patterns[label].append(pattern)
-                    self.matcher.add(label, [pattern])
-                else:
-                    raise ValueError(Errors.E097.format(pattern=pattern))
-
-    def clear(self) -> None:
-        """Reset all patterns."""
-        self.token_patterns = defaultdict(list)
-        self.phrase_patterns = defaultdict(list)
-        self._ent_ids = defaultdict(tuple)
-        self.matcher = Matcher(
-            self.nlp.vocab,
-            validate=self._validate,
-            fuzzy_compare=self.matcher_fuzzy_compare,
-        )
-        self.phrase_matcher = PhraseMatcher(
-            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
-        )
-
-    def remove(self, ent_id: str) -> None:
-        """Remove a pattern by its ent_id if a pattern with this ent_id was added before
-
-        ent_id (str): id of the pattern to be removed
-        RETURNS: None
-        DOCS: https://spacy.io/api/entityruler#remove
-        """
-        label_id_pairs = [
-            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
-        ]
-        if not label_id_pairs:
-            raise ValueError(
-                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
-            )
-        created_labels = [
-            self._create_label(label, eid) for (label, eid) in label_id_pairs
-        ]
-        # remove the patterns from self.phrase_patterns
-        self.phrase_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.phrase_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        self.token_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.token_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        for label in created_labels:
-            if label in self.phrase_matcher:
-                self.phrase_matcher.remove(label)
-            else:
-                self.matcher.remove(label)
-
-    def _require_patterns(self) -> None:
-        """Raise a warning if this component has no patterns defined."""
-        if len(self) == 0:
-            warnings.warn(Warnings.W036.format(name=self.name))
-
-    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
-        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
-
-        label (str): The value of label in a pattern entry
-        RETURNS (tuple): ent_label, ent_id
-        """
-        if self.ent_id_sep in label:
-            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
-        else:
-            ent_label = label
-            ent_id = None  # type: ignore
-        return ent_label, ent_id
-
-    def _create_label(self, label: Any, ent_id: Any) -> str:
-        """Join Entity label with ent_id if the pattern has an `id` attribute
-        If ent_id is not a string, the label is returned as is.
-
-        label (str): The label to set for ent.label_
-        ent_id (str): The label
-        RETURNS (str): The ent_label joined with configured `ent_id_sep`
-        """
-        if isinstance(ent_id, str):
-            label = f"{label}{self.ent_id_sep}{ent_id}"
-        return label
-
-    def from_bytes(
-        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a bytestring.
-
-        patterns_bytes (bytes): The bytestring to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_bytes
-        """
-        cfg = srsly.msgpack_loads(patterns_bytes)
-        self.clear()
-        if isinstance(cfg, dict):
-            self.add_patterns(cfg.get("patterns", cfg))
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab,
-                attr=self.phrase_matcher_attr,
-            )
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-        else:
-            self.add_patterns(cfg)
-        return self
-
-    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
-        """Serialize the entity ruler patterns to a bytestring.
-
-        RETURNS (bytes): The serialized patterns.
-
-        DOCS: https://spacy.io/api/entityruler#to_bytes
-        """
-        serial = {
-            "overwrite": self.overwrite,
-            "ent_id_sep": self.ent_id_sep,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "patterns": self.patterns,
-        }
-        return srsly.msgpack_dumps(serial)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a file. Expects a file containing
-        newline-delimited JSON (JSONL) with one entry per line.
-
-        path (str / Path): The JSONL file to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_disk
-        """
-        path = ensure_path(path)
-        self.clear()
-        depr_patterns_path = path.with_suffix(".jsonl")
-        if path.suffix == ".jsonl":  # user provides a jsonl
-            if path.is_file:
-                patterns = srsly.read_jsonl(path)
-                self.add_patterns(patterns)
-            else:
-                raise ValueError(Errors.E1023.format(path=path))
-        elif depr_patterns_path.is_file():
-            patterns = srsly.read_jsonl(depr_patterns_path)
-            self.add_patterns(patterns)
-        elif path.is_dir():  # path is a valid directory
-            cfg = {}
-            deserializers_patterns = {
-                "patterns": lambda p: self.add_patterns(
-                    srsly.read_jsonl(p.with_suffix(".jsonl"))
-                )
-            }
-            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
-            from_disk(path, deserializers_cfg, {})
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab, attr=self.phrase_matcher_attr
-            )
-            from_disk(path, deserializers_patterns, {})
-        else:  # path is not a valid directory or file
-            raise ValueError(Errors.E146.format(path=path))
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Save the entity ruler patterns to a directory. The patterns will be
-        saved as newline-delimited JSON (JSONL).
-
-        path (str / Path): The JSONL file to save.
-
-        DOCS: https://spacy.io/api/entityruler#to_disk
-        """
-        path = ensure_path(path)
-        cfg = {
-            "overwrite": self.overwrite,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "ent_id_sep": self.ent_id_sep,
-        }
-        serializers = {
-            "patterns": lambda p: srsly.write_jsonl(
-                p.with_suffix(".jsonl"), self.patterns
-            ),
-            "cfg": lambda p: srsly.write_json(p, cfg),
-        }
-        if path.suffix == ".jsonl":  # user wants to save only JSONL
-            srsly.write_jsonl(path, self.patterns)
-        else:
-            to_disk(path, serializers, {})
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 2a5e2179a35..4875c5e4bff 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,6 +17,14 @@
 
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
+from ..language import Language
+from ..errors import Errors, Warnings
+from ..util import ensure_path, SimpleFrozenList, registry
+from ..tokens import Doc, Span
+from ..scorer import Scorer, get_ner_prf
+from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
@@ -33,7 +41,7 @@
 
 
 @Language.factory(
-    "future_entity_ruler",
+    "entity_ruler",
     assigns=["doc.ents"],
     default_config={
         "phrase_matcher_attr": None,
@@ -79,6 +87,15 @@ def make_entity_ruler(
     )
 
 
+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score
+
+
 @Language.factory(
     "span_ruler",
     assigns=["doc.spans"],
@@ -136,7 +153,7 @@ def prioritize_new_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by allowing
     spans to overwrite any entities that they overlap with. Intended to
-    replicate the overwrite_ents=True behavior from the EntityRuler.
+    replicate the overwrite_ents=True behavior from the v3 EntityRuler.
 
     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
@@ -167,7 +184,7 @@ def prioritize_existing_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by prioritizing
     existing entities. Intended to replicate the overwrite_ents=False behavior
-    from the EntityRuler.
+    from the v3 EntityRuler.
 
     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 4ad234cba3b..629f402f38e 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -87,14 +87,15 @@ def test_issue4373():
 
 @pytest.mark.issue(4651)
 def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
-    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    """Test that the entity_ruler PhraseMatcher is deserialized correctly using
+    the method from_disk when the entity_ruler argument phrase_matcher_attr is
     specified.
     """
     text = "Spacy is a python library for nlp"
     nlp = English()
     patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
-    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    config = {"phrase_matcher_attr": "LOWER"}
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp(text)
     res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
@@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr():
     with make_tempdir() as d:
         file_path = d / "entityruler"
         ruler.to_disk(file_path)
-        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+        nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path)
     doc_reloaded = nlp_reloaded(text)
     res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
     assert res == res_reloaded
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index d0ab003919e..9f5204006ec 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,6 +2,12 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, merge_entities
+from spacy.pipeline import SpanRuler
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
@@ -10,8 +16,6 @@
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc, Span
 
-ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"]
-
 
 @pytest.fixture
 def nlp():
@@ -38,13 +42,12 @@ def add_ent_component(doc):
 
 
 @pytest.mark.issue(3345)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue3345(entity_ruler_factory):
+def test_issue3345():
     """Test case where preset entity crosses sentence boundary."""
     nlp = English()
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
@@ -63,15 +66,14 @@ def test_issue3345(entity_ruler_factory):
 
 
 @pytest.mark.issue(4849)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue4849(entity_ruler_factory):
+def test_issue4849():
     nlp = English()
     patterns = [
         {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
         {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
     ]
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
+        "entity_ruler",
         name="entity_ruler",
         config={"phrase_matcher_attr": "LOWER"},
     )
@@ -94,11 +96,10 @@ def test_issue4849(entity_ruler_factory):
 
 
 @pytest.mark.issue(5918)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue5918(entity_ruler_factory):
+def test_issue5918():
     # Test edge case when merging entities.
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Digicon Inc"},
         {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
@@ -123,10 +124,9 @@ def test_issue5918(entity_ruler_factory):
 
 
 @pytest.mark.issue(8168)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue8168(entity_ruler_factory):
+def test_issue8168():
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Apple"},
         {
@@ -146,12 +146,9 @@ def test_issue8168(entity_ruler_factory):
 
 
 @pytest.mark.issue(8216)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_fix8216(nlp, patterns):
     """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"validate": True}
-    )
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
     assert pattern_count > 0
@@ -160,16 +157,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
     assert after_count == pattern_count
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_init(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     assert "HELLO" in ruler
     assert "BYE" in ruler
     nlp.remove_pipe("entity_ruler")
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     doc = nlp("hello world bye bye")
     assert len(doc.ents) == 2
@@ -177,23 +173,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
     nlp.remove_pipe("entity_ruler")
-    nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    nlp.add_pipe("entity_ruler")
     assert nlp.pipe_names == ["entity_ruler"]
     with pytest.warns(UserWarning):
         doc = nlp("hello world bye bye")
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
     assert len(ruler.labels) == 4
@@ -205,7 +199,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     nlp.config["initialize"]["components"]["entity_ruler"] = {
         "patterns": {"@misc": "entity_ruler_patterns"}
     }
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     nlp.initialize()
     assert len(ruler.labels) == 4
@@ -214,20 +208,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     ruler.initialize(lambda: [])
     assert len(ruler.labels) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     doc = nlp("hello world")
@@ -239,9 +231,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_existing(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -250,11 +241,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_overwrite(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -264,11 +252,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_complex(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("foo foo bye bye")
@@ -279,11 +264,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents[1]) == 2
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_entity_id(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
     assert len(doc.ents) == 1
@@ -291,26 +273,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
     assert doc.ents[0].ent_id_ == "a1"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_cfg_ent_id_sep(nlp, patterns):
     config = {"overwrite_ents": True, "ent_id_sep": "**"}
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config)
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
-    if isinstance(ruler, EntityRuler):
-        assert "TECH_ORG**a1" in ruler.phrase_patterns
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "TECH_ORG"
     assert doc.ents[0].ent_id_ == "a1"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns)
+def test_entity_ruler_serialize_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
     new_ruler = new_ruler.from_bytes(ruler_bytes)
@@ -322,28 +301,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
     assert sorted(new_ruler.labels) == sorted(ruler.labels)
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_phrase_matcher_attr_bytes(
-    nlp, patterns, entity_ruler_factory
-):
-    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns)
+def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"}
+    )
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
-    assert new_ruler.phrase_matcher_attr is None
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.phrase_matcher_attr == "LOWER"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_validate(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
-    validated_ruler = EntityRuler(nlp, validate=True)
+def test_entity_ruler_validate(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    validated_ruler = nlp.add_pipe(
+        "entity_ruler", name="validated_ruler", config={"validate": True}
+    )
 
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
@@ -360,16 +338,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory):
         validated_ruler.add_patterns([invalid_pattern])
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+def test_entity_ruler_properties(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
-    assert sorted(ruler.ent_ids) == ["a1", "a2"]
+    assert sorted(ruler.ids) == ["a1", "a2"]
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_overlapping_spans(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "FOOBAR", "pattern": "foo bar"},
         {"label": "BARBAZ", "pattern": "bar baz"},
@@ -418,14 +395,13 @@ def make_test_fuzzy_compare_disabled():
 
 
 @pytest.mark.parametrize("n_process", [1, 2])
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
+def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:
         texts = ["I enjoy eating Pizza Hut pizza."]
 
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
 
-        ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+        ruler = nlp.add_pipe("entity_ruler")
         ruler.add_patterns(patterns)
 
         for doc in nlp.pipe(texts, n_process=2):
@@ -433,9 +409,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
                 assert ent.ent_id_ == "1234"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler.jsonl")
@@ -444,9 +419,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
             ruler.from_disk(d / "non_existing.jsonl")  # read from a bad jsonl file
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_dir(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler")
@@ -455,9 +429,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
             ruler.from_disk(d / "non_existing_dir")  # read from a bad directory
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_basic(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -467,24 +440,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
     doc = nlp("Dina went to school")
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
     assert doc.ents[0].label_ == "PERSON"
     assert doc.ents[0].text == "Dina"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
     assert len(ruler.patterns) == 2
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "DinaCorp", "id": "dina"},
@@ -493,25 +458,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory
     ruler.add_patterns(patterns)
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
-        assert "ORG||dina" in ruler.phrase_matcher
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
-        assert "ORG||dina" not in ruler.phrase_matcher
     assert len(doc.ents) == 1
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -526,9 +481,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
             ruler.remove_by_id("nepattern")
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_several_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -542,27 +496,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "Dina"
     assert doc.ents[1].label_ == "ORG"
     assert doc.ents[1].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 2
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "ORG"
     assert doc.ents[0].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 1
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -578,21 +525,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert doc.ents[1].text == "ACME"
     assert doc.ents[2].label_ == "DATE"
     assert doc.ents[2].text == "her birthday"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-        ruler.remove("acme")
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("dina")
-        ruler.remove_by_id("acme")
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("dina")
+    ruler.remove_by_id("acme")
+    ruler.remove_by_id("bday")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_all_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -600,29 +541,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     ]
     ruler.add_patterns(patterns)
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     assert len(ruler.patterns) == 2
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("bday")
     assert len(ruler.patterns) == 0
     with pytest.warns(UserWarning):
         doc = nlp("Dina founded her company ACME on her birthday")
         assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_and_add(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "DATE", "pattern": "last time"}]
     ruler.add_patterns(patterns)
     doc = ruler(
@@ -643,10 +574,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "last time"
     assert doc.ents[1].label_ == "DATE"
     assert doc.ents[1].text == "this time"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc("I saw him last time we met, this time he brought some flowers")
     )
@@ -669,10 +597,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     )
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc(
             "I saw him last time we met, this time he brought some flowers, another time some chocolate."
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 6bbe743a12d..8170488f758 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,15 +8,9 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import (
-    DependencyParser,
-    EntityRecognizer,
-    EntityRuler,
-    SentenceRecognizer,
-    Tagger,
-    TextCategorizer,
-    TrainablePipe,
-)
+from spacy.pipeline import DependencyParser, EntityRecognizer
+from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
+from spacy.pipeline import TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
@@ -91,58 +85,17 @@ def test_issue_3526_1(en_vocab):
         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
     ]
     nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     ruler_bytes = ruler.to_bytes()
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
-    assert ruler.overwrite
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
+    )
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(ruler)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.overwrite == ruler.overwrite
-    assert new_ruler.ent_id_sep == ruler.ent_id_sep
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_2(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
-    new_ruler = EntityRuler(nlp)
-    new_ruler = new_ruler.from_bytes(bytes_old_style)
-    assert len(new_ruler) == len(ruler)
-    for pattern in ruler.patterns:
-        assert pattern in new_ruler.patterns
-    assert new_ruler.overwrite is not ruler.overwrite
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_3(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    with make_tempdir() as tmpdir:
-        out_file = tmpdir / "entity_ruler"
-        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
-        new_ruler = EntityRuler(nlp).from_disk(out_file)
-        for pattern in ruler.patterns:
-            assert pattern in new_ruler.patterns
-        assert len(new_ruler) == len(ruler)
-        assert new_ruler.overwrite is not ruler.overwrite
 
 
 @pytest.mark.issue(3526)
@@ -156,16 +109,14 @@ def test_issue_3526_4(en_vocab):
         nlp.to_disk(tmpdir)
         ruler = nlp.get_pipe("entity_ruler")
         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert ruler.overwrite is True
         nlp2 = load(tmpdir)
         new_ruler = nlp2.get_pipe("entity_ruler")
         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert new_ruler.overwrite is True
 
 
 @pytest.mark.issue(4042)
 def test_issue4042():
-    """Test that serialization of an EntityRuler before NER works fine."""
+    """Test that serialization of an entity_ruler before NER works fine."""
     nlp = English()
     # add ner pipe
     ner = nlp.add_pipe("ner")
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index a35b6e2566c..7976e7725e0 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,13 +1,24 @@
 ---
 title: EntityRuler
-tag: class
-source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
 ---
 
+<Infobox title="New in v4" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](#migrating) below for differences between the v3
+`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+See the [`SpanRuler`](/api/spanruler) API docs for the full API.
+
+</Infobox>
+
 The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
 token-based rules or exact phrase matches. It can be combined with the
 statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
@@ -64,273 +75,51 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-```python
-%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
-```
-
-## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
-
-Initialize the entity ruler. If patterns are supplied here, they need to be a
-list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
-be a token pattern (list) or a phrase pattern (string). For example:
-`{"label": "ORG", "pattern": "Apple"}`.
-
-> #### Example
->
-> ```python
-> # Construction via add_pipe
-> ruler = nlp.add_pipe("entity_ruler")
->
-> # Construction from class
-> from spacy.pipeline import EntityRuler
-> ruler = EntityRuler(nlp, overwrite_ents=True)
-> ```
-
-| Name                                                 | Description                                                                                                                                                                                                                           |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
-| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
-| _keyword-only_                                       |                                                                                                                                                                                                                                       |
-| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
-| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
-| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
-| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
-| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
-| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
-
-## EntityRuler.initialize {id="initialize",tag="method",version="3"}
-
-Initialize the component with data and used before training to load in rules
-from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
-method is typically called by [`Language.initialize`](/api/language#initialize)
-and lets you customize arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-> #### Example
->
-> ```python
-> entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.entity_ruler]
->
-> [initialize.components.entity_ruler.patterns]
-> @readers = "srsly.read_jsonl.v1"
-> path = "corpus/entity_ruler_patterns.jsonl
-> ```
-
-| Name           | Description                                                                                                                                                          |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                                                      |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
-
-## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
-
-The number of all patterns added to the entity ruler.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> assert len(ruler) == 0
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert len(ruler) == 1
-> ```
-
-| Name        | Description                     |
-| ----------- | ------------------------------- |
-| **RETURNS** | The number of patterns. ~~int~~ |
-
-## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
-
-Whether a label is present in the patterns.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert "ORG" in ruler
-> assert not "PERSON" in ruler
-> ```
-
-| Name        | Description                                           |
-| ----------- | ----------------------------------------------------- |
-| `label`     | The label to check. ~~str~~                           |
-| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
-
-## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
-
-Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
-happens automatically after the component has been added to the pipeline using
-[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
-with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occuring first in the Doc
-is chosen.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
->
-> doc = nlp("A text about Apple.")
-> ents = [(ent.text, ent.label_) for ent in doc.ents]
-> assert ents == [("Apple", "ORG")]
-> ```
-
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
-| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+## Migrating from v3 {#migrating}
 
-## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+### Loading patterns
 
-Add patterns to the entity ruler. A pattern can either be a token pattern (list
-of dicts) or a phrase pattern (string). For more details, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
+Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
+initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
+path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
+JSONL file separately and then added through
+[`SpanRuler.initialize`](/api/spanruler#initialize]) or
+[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
 
-> #### Example
->
-> ```python
-> patterns = [
->     {"label": "ORG", "pattern": "Apple"},
->     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
-> ]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ```
-
-| Name       | Description                                                      |
-| ---------- | ---------------------------------------------------------------- |
-| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
-
-## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
-
-Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
-the ID does not exist.
-
-> #### Example
->
-> ```python
-> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ruler.remove("apple")
-> ```
-
-| Name | Description                         |
-| ---- | ----------------------------------- |
-| `id` | The ID of the pattern rule. ~~str~~ |
-
-## EntityRuler.to_disk {id="to_disk",tag="method"}
-
-Save the entity ruler patterns to a directory. The patterns will be saved as
-newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
-only the patterns are saved as JSONL. If a directory name is provided, a
-`patterns.jsonl` and `cfg` file with the component configuration is exported.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
-> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
-> ```
-
-| Name   | Description                                                                                                                                              |
-| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-
-## EntityRuler.from_disk {id="from_disk",tag="method"}
-
-Load the entity ruler from a path. Expects either a file containing
-newline-delimited JSON (JSONL) with one entry per line, or a directory
-containing a `patterns.jsonl` file and a `cfg` file with the component
-configuration.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
-> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
-> ```
-
-| Name        | Description                                                                                                   |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
-
-## EntityRuler.to_bytes {id="to_bytes",tag="method"}
-
-Serialize the entity ruler patterns to a bytestring.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler_bytes = ruler.to_bytes()
-> ```
-
-| Name        | Description                        |
-| ----------- | ---------------------------------- |
-| **RETURNS** | The serialized patterns. ~~bytes~~ |
-
-## EntityRuler.from_bytes {id="from_bytes",tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> ruler_bytes = ruler.to_bytes()
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_bytes(ruler_bytes)
-> ```
-
-| Name         | Description                                        |
-| ------------ | -------------------------------------------------- |
-| `bytes_data` | The bytestring to load. ~~bytes~~                  |
-| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
-
-## EntityRuler.labels {id="labels",tag="property"}
-
-All labels present in the match patterns.
-
-| Name        | Description                            |
-| ----------- | -------------------------------------- |
-| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.from_disk("patterns.jsonl")
++ import srsly
++ patterns = srsly.read_jsonl("patterns.jsonl")
++ ruler.add_patterns(patterns)
+```
 
-## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+### Saving patterns
 
-All entity IDs present in the `id` properties of the match patterns.
+`SpanRuler.to_disk` always saves the full component data to a directory and does
+not include an option to save the patterns to a single JSONL file.
 
-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.to_disk("patterns.jsonl")
++ import srsly
++ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
+```
 
-## EntityRuler.patterns {id="patterns",tag="property"}
+### Accessing token and phrase patterns
 
-Get all patterns that were added to the entity ruler.
+The separate token patterns and phrase patterns are no longer accessible under
+`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
+patterns in their original format using the property
+[`SpanRuler.patterns`](/api/spanruler#patterns).
 
-| Name        | Description                                                                              |
-| ----------- | ---------------------------------------------------------------------------------------- |
-| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+### Removing patterns by ID
 
-## Attributes {id="attributes"}
+[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
+remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
 
-| Name              | Description                                                                                                           |
-| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
-| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
-| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.remove("id")
++ ruler.remove_by_id("id")
+```
diff --git a/website/docs/api/spanruler.mdx b/website/docs/api/spanruler.mdx
index 5889b1906ad..1b6c558acef 100644
--- a/website/docs/api/spanruler.mdx
+++ b/website/docs/api/spanruler.mdx
@@ -13,7 +13,18 @@ The span ruler lets you add spans to [`Doc.spans`](/api/doc#spans) and/or
 usage examples, see the docs on
 [rule-based span matching](/usage/rule-based-matching#spanruler).
 
-## Assigned Attributes {id="assigned-attributes"}
+<Infobox title="Replacement of the EntityRuler" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](/api/entityruler#migrating) for differences between
+the v3 `EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+</Infobox>
+
+## Assigned Attributes {#assigned-attributes}
 
 Matches will be saved to `Doc.spans[spans_key]` as a
 [`SpanGroup`](/api/spangroup) and/or to `Doc.ents`, where the annotation is
diff --git a/website/docs/usage/101/_architecture.mdx b/website/docs/usage/101/_architecture.mdx
index 2a63a3741fa..35c36088ab9 100644
--- a/website/docs/usage/101/_architecture.mdx
+++ b/website/docs/usage/101/_architecture.mdx
@@ -41,25 +41,27 @@ components for different language processing tasks and also allows adding
 
 ![The processing pipeline](/images/pipeline.svg)
 
-| Name                                            | Description                                                                                 |
-| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
-| [`AttributeRuler`](/api/attributeruler)         | Set token attributes using matcher rules.                                                   |
-| [`DependencyParser`](/api/dependencyparser)     | Predict syntactic dependencies.                                                             |
-| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words.                                                                |
-| [`EntityLinker`](/api/entitylinker)             | Disambiguate named entities to nodes in a knowledge base.                                   |
-| [`EntityRecognizer`](/api/entityrecognizer)     | Predict named entities, e.g. persons or products.                                           |
-| [`EntityRuler`](/api/entityruler)               | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
-| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words using rules and lookups.                                  |
-| [`Morphologizer`](/api/morphologizer)           | Predict morphological features and coarse-grained part-of-speech tags.                      |
-| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries.                                                                |
-| [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
-| [`Tagger`](/api/tagger)                         | Predict part-of-speech tags.                                                                |
-| [`TextCategorizer`](/api/textcategorizer)       | Predict categories or labels over the whole document.                                       |
-| [`Tok2Vec`](/api/tok2vec)                       | Apply a "token-to-vector" model and set its outputs.                                        |
-| [`Tokenizer`](/api/tokenizer)                   | Segment raw text and create `Doc` objects from the words.                                   |
-| [`TrainablePipe`](/api/pipe)                    | Class that all trainable pipeline components inherit from.                                  |
-| [`Transformer`](/api/transformer)               | Use a transformer model and set its outputs.                                                |
-| [Other functions](/api/pipeline-functions)      | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
+| Component name         | Component class                                      | Description                                                                                 |
+| ---------------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| `attribute_ruler`      | [`AttributeRuler`](/api/attributeruler)              | Set token attributes using matcher rules.                                                   |
+| `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Disambiguate named entities to nodes in a knowledge base.                                   |
+| `entity_ruler`         | [`SpanRuler`](/api/spanruler)                        | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
+| `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Determine the base forms of words using rules and lookups.                                  |
+| `morphologizer`        | [`Morphologizer`](/api/morphologizer)                | Predict morphological features and coarse-grained part-of-speech tags.                      |
+| `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Predict named entities, e.g. persons or products.                                           |
+| `parser`               | [`DependencyParser`](/api/dependencyparser)          | Predict syntactic dependencies.                                                             |
+| `senter`               | [`SentenceRecognizer`](/api/sentencerecognizer)      | Predict sentence boundaries.                                                                |
+| `sentencizer`          | [`Sentencizer`](/api/sentencizer)                    | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
+| `span_ruler`           | [`SpanRuler`](/api/spanruler)                        | Add spans to the `Doc` using token-based rules or exact phrase matches.                     |
+| `tagger`               | [`Tagger`](/api/tagger)                              | Predict part-of-speech tags.                                                                |
+| `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Predict exactly one category or label over a whole document.                                |
+| `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Predict 0, 1 or more categories or labels over a whole document.                            |
+| `tok2vec`              | [`Tok2Vec`](/api/tok2vec)                            | Apply a "token-to-vector" model and set its outputs.                                        |
+| `tokenizer`            | [`Tokenizer`](/api/tokenizer)                        | Segment raw text and create `Doc` objects from the words.                                   |
+| `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer)      | Predict base forms of words.                                                                |
+| `transformer`          | [`Transformer`](/api/transformer)                    | Use a transformer model and set its outputs.                                                |
+| -                      | [`TrainablePipe`](/api/pipe)                         | Class that all trainable pipeline components inherit from.                                  |
+| -                      | [Other functions](/api/pipeline-functions)           | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
 
 ### Matchers {id="architecture-matchers"}
 
diff --git a/website/docs/usage/101/_pipelines.mdx b/website/docs/usage/101/_pipelines.mdx
index 315291762ff..e5a08c5e424 100644
--- a/website/docs/usage/101/_pipelines.mdx
+++ b/website/docs/usage/101/_pipelines.mdx
@@ -51,9 +51,9 @@ example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
 only work if it's added after the tagger. The parser will respect pre-defined
 sentence boundaries, so if a previous component in the pipeline sets them, its
 dependency predictions may be different. Similarly, it matters if you add the
-[`EntityRuler`](/api/entityruler) before or after the statistical entity
-recognizer: if it's added before, the entity recognizer will take the existing
-entities into account when making predictions. The
+[`SpanRuler`](/api/spanruler) before or after the statistical entity recognizer:
+if it's added before and it is writing to `doc.ents`, then the entity recognizer
+will take those existing entities into account when making predictions. The
 [`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge
 base IDs, should be preceded by a pipeline component that recognizes entities
 such as the [`EntityRecognizer`](/api/entityrecognizer).
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index 3e58b251dec..ec93aee2cf3 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -297,13 +297,14 @@ available pipeline components and component functions.
 > ruler = nlp.add_pipe("entity_ruler")
 > ```
 
-| String name            | Component                                            | Description                                                                               |
+| Component name         | Component class                                      | Description                                                                               |
 | ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- |
 | `tagger`               | [`Tagger`](/api/tagger)                              | Assign part-of-speech-tags.                                                               |
 | `parser`               | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                 |
 | `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                    |
 | `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
-| `entity_ruler`         | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                            |
+| `span_ruler`           | [`SpanRuler`](/api/spanruler)                        | Assign spans based on pattern rules and dictionaries.                                     |
+| `entity_ruler`         | [`SpanRuler`](/api/spanruler)                        | Assign named entities based on pattern rules and dictionaries.                            |
 | `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                   |
 | `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.   |
 | `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words using rules and lookups.                                       |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index c90172b4325..86220440991 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -511,7 +511,7 @@ matches = matcher(doc)
 ```
 
 A very similar logic has been implemented in the built-in
-[`EntityRuler`](/api/entityruler) by the way. It also takes care of handling
+[`entity_ruler`](/api/entityruler) by the way. It also takes care of handling
 overlapping matches, which you would otherwise have to take care of yourself.
 
 > #### Tip: Visualizing matches
@@ -1305,7 +1305,7 @@ of patterns such as `{}` that match any token in the sentence.
 
 ## Rule-based entity recognition {id="entityruler",version="2.1"}
 
-The [`EntityRuler`](/api/entityruler) is a component that lets you add named
+The [`entity_ruler`](/api/entityruler) is a component that lets you add named
 entities based on pattern dictionaries, which makes it easy to combine
 rule-based and statistical named entity recognition for even more powerful
 pipelines.
@@ -1330,13 +1330,12 @@ pattern. The entity ruler accepts two types of patterns:
 
 ### Using the entity ruler {id="entityruler-usage"}
 
-The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically
-added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
-called on a text, it will find matches in the `doc` and add them as entities to
-the `doc.ents`, using the specified pattern label as the entity label. If any
-matches were to overlap, the pattern matching most tokens takes priority. If
-they also happen to be equally long, then the match occurring first in the `Doc`
-is chosen.
+The `entity_ruler` is a pipeline component that's typically added via
+[`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a
+text, it will find matches in the `doc` and add them as entities to `doc.ents`,
+using the specified pattern label as the entity label. If any matches were to
+overlap, the pattern matching most tokens takes priority. If they also happen to
+be equally long, then the match occurring first in the `Doc` is chosen.
 
 ```python {executable="true"}
 from spacy.lang.en import English
@@ -1372,7 +1371,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```
 
-#### Validating and debugging EntityRuler patterns {id="entityruler-pattern-validation",version="2.1.8"}
+#### Validating and debugging entity ruler patterns {#entityruler-pattern-validation new="2.1.8"}
 
 The entity ruler can validate patterns against a JSON schema with the config
 setting `"validate"`. See details under
@@ -1384,9 +1383,9 @@ ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
 
 ### Adding IDs to patterns {id="entityruler-ent-ids",version="2.2.2"}
 
-The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each
-pattern. Using the `id` attribute allows multiple patterns to be associated with
-the same entity.
+The [`entity_ruler`](/api/entityruler) can also accept an `id` attribute for
+each pattern. Using the `id` attribute allows multiple patterns to be associated
+with the same entity.
 
 ```python {executable="true"}
 from spacy.lang.en import English
@@ -1405,10 +1404,10 @@ doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given
-in the patterns. So in the example above it's easy to identify that "San
-Francisco" and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given in
+the patterns. So in the example above it's easy to identify that "San Francisco"
+and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 
@@ -1431,13 +1430,13 @@ new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl")
 
 If you're using the [Prodigy](https://prodi.gy) annotation tool, you might
 recognize these pattern files from bootstrapping your named entity and text
-classification labelling. The patterns for the `EntityRuler` follow the same
+classification labelling. The patterns for the `entity_ruler` follow the same
 syntax, so you can use your existing Prodigy pattern files in spaCy, and vice
 versa.
 
 </Infobox>
 
-When you save out an `nlp` object that has an `EntityRuler` added to its
+When you save out an `nlp` object that has an `entity_ruler` added to its
 pipeline, its patterns are automatically exported to the pipeline directory:
 
 ```python
@@ -1460,9 +1459,9 @@ rules included!
 
 When using a large amount of **phrase patterns** (roughly > 10000) it's useful
 to understand how the `add_patterns` function of the entity ruler works. For
-each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc
-object. This happens in case you try to add the EntityRuler at the end of an
-existing pipeline with, for example, a POS tagger and want to extract matches
+each **phrase pattern**, the entity ruler calls the nlp object to construct a
+doc object. This happens in case you try to add the entity ruler at the end of
+an existing pipeline with, for example, a POS tagger and want to extract matches
 based on the pattern's POS signature. In this case you would pass a config value
 of `"phrase_matcher_attr": "POS"` for the entity ruler.
 
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index b44bd86ed06..97ae3c5e573 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,13 +187,13 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the new
-> [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the
+> [`SpanRuler`](/api/spanruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
-> saving out a pipeline with a rule-based entity recognizer and including all
-> rules _with_ the component data.
+> saving out a pipeline with rule-based components _with_ all the component
+> data.
 
 ```python {highlight="16-23,25-30"}
 import json
diff --git a/website/docs/usage/training.mdx b/website/docs/usage/training.mdx
index abb1b9cfd91..eda3f355f1a 100644
--- a/website/docs/usage/training.mdx
+++ b/website/docs/usage/training.mdx
@@ -421,7 +421,7 @@ your components during training, and the most common scenarios are:
 2. Update an existing **trained component** with more examples.
 3. Include an existing trained component without updating it.
 4. Include a non-trainable component, like a rule-based
-   [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a
+   [`SpanRuler`](/api/spanruler) or [`Sentencizer`](/api/sentencizer), or a
    fully [custom component](/usage/processing-pipelines#custom-components).
 
 If a component block defines a `factory`, spaCy will look it up in the

From 3b08cd07a6869bb95bb830841a2940b428fd474f Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Wed, 23 Nov 2022 13:09:32 +0100
Subject: [PATCH 134/504] Remove sentiment extension (#11722)

* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
---
 spacy/lexeme.pyi                            |   1 -
 spacy/lexeme.pyx                            |  13 --
 spacy/tests/README.md                       |   2 +-
 spacy/tests/doc/test_doc_api.py             |  13 +-
 spacy/tests/doc/test_span.py                |  25 ---
 spacy/tests/matcher/test_matcher_api.py     |   3 -
 spacy/tokens/doc.pxd                        |   2 -
 spacy/tokens/doc.pyi                        |   1 -
 spacy/tokens/doc.pyx                        |   5 -
 spacy/tokens/span.pyi                       |   2 -
 spacy/tokens/span.pyx                       |  10 --
 spacy/tokens/token.pyi                      |   2 -
 spacy/tokens/token.pyx                      |   8 -
 website/docs/api/doc.mdx                    |   2 -
 website/docs/api/lexeme.md                  | 163 ++++++++++++++++++++
 website/docs/api/span.mdx                   |   1 -
 website/docs/api/token.mdx                  |   1 -
 website/docs/usage/processing-pipelines.mdx |   2 +-
 website/docs/usage/rule-based-matching.mdx  |  16 +-
 19 files changed, 185 insertions(+), 87 deletions(-)
 create mode 100644 website/docs/api/lexeme.md

diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 9980b9fcefa..fb937d7b998 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -19,7 +19,6 @@ class Lexeme:
     def vector_norm(self) -> float: ...
     vector: Floats1d
     rank: int
-    sentiment: float
     @property
     def orth_(self) -> str: ...
     @property
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index f803d5e9394..3e63afa34ba 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -191,19 +191,6 @@ cdef class Lexeme:
         def __set__(self, value):
             self.c.id = value
 
-    property sentiment:
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the lexeme."""
-        def __get__(self):
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
-            return sentiment_table.get(self.c.orth, 0.0)
-
-        def __set__(self, float x):
-            if "lexeme_sentiment" not in self.vocab.lookups:
-                self.vocab.lookups.add_table("lexeme_sentiment")
-            sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
-            sentiment_table[self.c.orth] = x
-
     @property
     def orth_(self):
         """RETURNS (str): The original verbatim text of the lexeme
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index 82fabcc778b..f3c96a39e7c 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 
 To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
 
-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
 - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
 - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 73544c51a4f..946910b29e1 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -389,9 +389,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 
-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@@ -999,3 +997,12 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key2"]) == 1
     doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
     assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index ab8538b17dc..74874624888 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -306,31 +306,6 @@ def test_span_similarity_match():
         assert span1[:1].similarity(doc.vocab["a"]) == 1.0
 
 
-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
 def test_spans_are_hashable(en_tokenizer):
     """Test spans can be hashed."""
     text = "good stuff bad stuff"
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 106a00b3011..ecb4385dd90 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -51,8 +51,6 @@ def test_matcher_from_usage_docs(en_vocab):
 
     def label_sentiment(matcher, doc, i, matches):
         match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
         span = doc[start:end]
         with doc.retokenize() as retokenizer:
             retokenizer.merge(span)
@@ -62,7 +60,6 @@ def label_sentiment(matcher, doc, i, matches):
     matcher = Matcher(en_vocab)
     matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
     matcher(doc)
-    assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"
 
 
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 5e8975ed337..9fb6a72c87f 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -48,8 +48,6 @@ cdef class Doc:
 
     cdef TokenC* c
 
-    cdef public float sentiment
-
     cdef public dict activations
 
     cdef public dict user_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 5fda6f2f789..97c3f69f430 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -40,7 +40,6 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
-    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 497656b6570..48def8c9544 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -279,7 +279,6 @@ cdef class Doc:
         self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
-        self.sentiment = 0.0
         self.cats = {}
         self.activations = {}
         self.user_hooks = {}
@@ -1315,7 +1314,6 @@ cdef class Doc:
         other.tensor = copy.deepcopy(self.tensor)
         other.cats = copy.deepcopy(self.cats)
         other.user_data = copy.deepcopy(self.user_data)
-        other.sentiment = self.sentiment
         other.has_unknown_spaces = self.has_unknown_spaces
         other.user_hooks = dict(self.user_hooks)
         other.user_token_hooks = dict(self.user_token_hooks)
@@ -1415,7 +1413,6 @@ cdef class Doc:
             "text": lambda: self.text,
             "array_head": lambda: array_head,
             "array_body": lambda: self.to_array(array_head),
-            "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
             "spans": lambda: self.spans.to_bytes(),
@@ -1451,8 +1448,6 @@ cdef class Doc:
             for key, value in zip(user_data_keys, user_data_values):
                 self.user_data[key] = value
         cdef int i, start, end, has_space
-        if "sentiment" not in exclude and "sentiment" in msg:
-            self.sentiment = msg["sentiment"]
         if "tensor" not in exclude and "tensor" in msg:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index a6731d1c2d4..ae4a6209e7e 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -89,8 +89,6 @@ class Span:
     @property
     def tensor(self) -> FloatsXd: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index b212b4c4303..73f555747eb 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -594,16 +594,6 @@ cdef class Span:
             return None
         return self.doc.tensor[self.start : self.end]
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the span.
-        """
-        if "sentiment" in self.doc.user_span_hooks:
-            return self.doc.user_span_hooks["sentiment"](self)
-        else:
-            return sum([token.sentiment for token in self]) / len(self)
-
     @property
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""
diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi
index 435ace52707..5c3d4d0ba2b 100644
--- a/spacy/tokens/token.pyi
+++ b/spacy/tokens/token.pyi
@@ -78,8 +78,6 @@ class Token:
     @property
     def prob(self) -> float: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def lang(self) -> int: ...
     @property
     def idx(self) -> int: ...
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index c0cd0af42c0..3a7ce45c54a 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -309,14 +309,6 @@ cdef class Token:
         """RETURNS (float): Smoothed log probability estimate of token type."""
         return self.vocab[self.c.lex.orth].prob
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the token."""
-        if "sentiment" in self.doc.user_token_hooks:
-            return self.doc.user_token_hooks["sentiment"](self)
-        return self.vocab[self.c.lex.orth].sentiment
-
     @property
     def lang(self):
         """RETURNS (uint64): ID of the language of the parent document's
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 310ce0dc88d..28757cbc45f 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -762,7 +762,6 @@ The L2 norm of the document's vector representation.
 | `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
 | `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
 | `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
-| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
 | `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
 | `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
 | `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
@@ -786,7 +785,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name               | Description                                   |
 | ------------------ | --------------------------------------------- |
 | `text`             | The value of the `Doc.text` attribute.        |
-| `sentiment`        | The value of the `Doc.sentiment` attribute.   |
 | `tensor`           | The value of the `Doc.tensor` attribute.      |
 | `user_data`        | The value of the `Doc.user_data` dictionary.  |
 | `user_data_keys`   | The keys of the `Doc.user_data` dictionary.   |
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
new file mode 100644
index 00000000000..db1aba7aaec
--- /dev/null
+++ b/website/docs/api/lexeme.md
@@ -0,0 +1,163 @@
+---
+title: Lexeme
+teaser: An entry in the vocabulary
+tag: class
+source: spacy/lexeme.pyx
+---
+
+A `Lexeme` has no string context – it's a word type, as opposed to a word token.
+It therefore has no part-of-speech tag, dependency parse, or lemma (if
+lemmatization depends on the part-of-speech tag).
+
+## Lexeme.\_\_init\_\_ {#init tag="method"}
+
+Create a `Lexeme` object.
+
+| Name    | Description                        |
+| ------- | ---------------------------------- |
+| `vocab` | The parent vocabulary. ~~Vocab~~   |
+| `orth`  | The orth id of the lexeme. ~~int~~ |
+
+## Lexeme.set_flag {#set_flag tag="method"}
+
+Change the value of a boolean flag.
+
+> #### Example
+>
+> ```python
+> COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
+> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
+> ```
+
+| Name      | Description                                  |
+| --------- | -------------------------------------------- |
+| `flag_id` | The attribute ID of the flag to set. ~~int~~ |
+| `value`   | The new value of the flag. ~~bool~~          |
+
+## Lexeme.check_flag {#check_flag tag="method"}
+
+Check the value of a boolean flag.
+
+> #### Example
+>
+> ```python
+> is_my_library = lambda text: text in ["spaCy", "Thinc"]
+> MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
+> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
+> ```
+
+| Name        | Description                                    |
+| ----------- | ---------------------------------------------- |
+| `flag_id`   | The attribute ID of the flag to query. ~~int~~ |
+| **RETURNS** | The value of the flag. ~~bool~~                |
+
+## Lexeme.similarity {#similarity tag="method" model="vectors"}
+
+Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> orange = nlp.vocab["orange"]
+> apple_orange = apple.similarity(orange)
+> orange_apple = orange.similarity(apple)
+> assert apple_orange == orange_apple
+> ```
+
+| Name        | Description                                                                                                                      |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| other       | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
+| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~                                                                     |
+
+## Lexeme.has_vector {#has_vector tag="property" model="vectors"}
+
+A boolean value indicating whether a word vector is associated with the lexeme.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> assert apple.has_vector
+> ```
+
+| Name        | Description                                             |
+| ----------- | ------------------------------------------------------- |
+| **RETURNS** | Whether the lexeme has a vector data attached. ~~bool~~ |
+
+## Lexeme.vector {#vector tag="property" model="vectors"}
+
+A real-valued meaning representation.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> assert apple.vector.dtype == "float32"
+> assert apple.vector.shape == (300,)
+> ```
+
+| Name        | Description                                                                                      |
+| ----------- | ------------------------------------------------------------------------------------------------ |
+| **RETURNS** | A 1-dimensional array representing the lexeme's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+
+## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"}
+
+The L2 norm of the lexeme's vector representation.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> pasta = nlp.vocab["pasta"]
+> apple.vector_norm  # 7.1346845626831055
+> pasta.vector_norm  # 7.759851932525635
+> assert apple.vector_norm != pasta.vector_norm
+> ```
+
+| Name        | Description                                         |
+| ----------- | --------------------------------------------------- |
+| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
+
+## Attributes {#attributes}
+
+| Name                                         | Description                                                                                                                                                                                                                                                          |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                      | The lexeme's vocabulary. ~~Vocab~~                                                                                                                                                                                                                                   |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
+| `orth_`                                      | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                 |
+| `rank`                                       | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                |
+| `flags`                                      | Container of the lexeme's binary flags. ~~int~~                                                                                                                                                                                                                      |
+| `norm`                                       | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~                                                                                                                                                                                                |
+| `norm_`                                      | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~                                                                                                                                                                                                |
+| `lower`                                      | Lowercase form of the word. ~~int~~                                                                                                                                                                                                                                  |
+| `lower_`                                     | Lowercase form of the word. ~~str~~                                                                                                                                                                                                                                  |
+| `shape`                                      | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `prefix`                                     | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~                                                                                                                                                                                            |
+| `prefix_`                                    | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~                                                                                                                                                                                            |
+| `suffix`                                     | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~                                                                                                                                                                                              |
+| `suffix_`                                    | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
+| `is_alpha`                                   | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~                                                                                                                                                                    |
+| `is_ascii`                                   | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~                                                                                                                                                     |
+| `is_digit`                                   | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~                                                                                                                                                                                   |
+| `is_lower`                                   | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. ~~bool~~                                                                                                                                                                                          |
+| `is_upper`                                   | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. ~~bool~~                                                                                                                                                                                          |
+| `is_title`                                   | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. ~~bool~~                                                                                                                                                                                          |
+| `is_punct`                                   | Is the lexeme punctuation? ~~bool~~                                                                                                                                                                                                                                  |
+| `is_left_punct`                              | Is the lexeme a left punctuation mark, e.g. `(`? ~~bool~~                                                                                                                                                                                                            |
+| `is_right_punct`                             | Is the lexeme a right punctuation mark, e.g. `)`? ~~bool~~                                                                                                                                                                                                           |
+| `is_space`                                   | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. ~~bool~~                                                                                                                                                                    |
+| `is_bracket`                                 | Is the lexeme a bracket? ~~bool~~                                                                                                                                                                                                                                    |
+| `is_quote`                                   | Is the lexeme a quotation mark? ~~bool~~                                                                                                                                                                                                                             |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the lexeme a currency symbol? ~~bool~~                                                                                                                                                                                                                            |
+| `like_url`                                   | Does the lexeme resemble a URL? ~~bool~~                                                                                                                                                                                                                             |
+| `like_num`                                   | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                          |
+| `like_email`                                 | Does the lexeme resemble an email address? ~~bool~~                                                                                                                                                                                                                  |
+| `is_oov`                                     | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                      |
+| `is_stop`                                    | Is the lexeme part of a "stop list"? ~~bool~~                                                                                                                                                                                                                        |
+| `lang`                                       | Language of the parent vocabulary. ~~int~~                                                                                                                                                                                                                           |
+| `lang_`                                      | Language of the parent vocabulary. ~~str~~                                                                                                                                                                                                                           |
+| `prob`                                       | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                 |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 5e7495f17ca..1774a298ff2 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -568,5 +568,4 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 12b99394350..16d421c12f4 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index ec93aee2cf3..c0fc4207046 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1388,7 +1388,7 @@ separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
 immediately know what's built-in and what's custom – for example,
-`doc.sentiment` is spaCy, while `doc._.sent_score` isn't.
+`doc.lang` is spaCy, while `doc._.language` isn't.
 
 </Accordion>
 
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 86220440991..8469d587ed1 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -811,6 +811,9 @@ whitespace, making them easy to match as well.
 ```python {executable="true"}
 from spacy.lang.en import English
 from spacy.matcher import Matcher
+from spacy.tokens import Doc
+
+Doc.set_extension("sentiment", default=0.0)
 
 nlp = English()  # We only want the tokenizer, so no need to load a pipeline
 matcher = Matcher(nlp.vocab)
@@ -826,9 +829,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
 matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
 matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
@@ -857,17 +860,18 @@ is "Smiling Face With Heart-Eyes". Assigning it to a
 the emoji span will make it available as `span._.emoji_desc`.
 
 ```python
-import emoji  # Installation: pip install emoji
-from spacy.tokens import Span  # Get the global Span object
+from emojipedia import Emojipedia  # Installation: pip install emojipedia
+from spacy.tokens import Doc, Span  # Get the global Doc and Span object
 
 Span.set_extension("emoji_desc", default=None)  # Register the custom attribute
+Doc.set_extension("sentiment", default=0.0)
 
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
     span = doc[start:end]
     # Verify if it is an emoji and set the extension attribute correctly.
     if emoji.is_emoji(span[0].text):

From 324b1408a1d4df9c54f5fe7c4a07303c4354e36e Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 5 Dec 2022 08:57:24 +0100
Subject: [PATCH 135/504] prettier formatting

---
 website/docs/api/cli.mdx                    | 30 ++++++++++-----------
 website/docs/usage/processing-pipelines.mdx |  4 +--
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 950d98c1f68..47028f4a2e7 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1343,21 +1343,21 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                     | Description                                                                                                                                                                          |
-| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                     |
-| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-
-## assemble {id="assemble",tag="command"}
+| Name                    | Description                                                                                                                                                                          |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
+| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
+| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
+| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
+| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
+| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
+| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
+| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+
+## assemble {#assemble tag="command"}
 
 Assemble a pipeline from a config file without additional training. Expects a
 [config file](/api/data-formats#config) with all settings and hyperparameters.
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index c0fc4207046..fb5de5da102 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1387,8 +1387,8 @@ Writing to a `._` attribute instead of to the `Doc` directly keeps a clearer
 separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
-immediately know what's built-in and what's custom – for example,
-`doc.lang` is spaCy, while `doc._.language` isn't.
+immediately know what's built-in and what's custom – for example, `doc.lang` is
+spaCy, while `doc._.language` isn't.
 
 </Accordion>
 

From 89a6f0510b59cbacc53ccff076b638e58f794a3f Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 5 Dec 2022 17:43:23 +0900
Subject: [PATCH 136/504] Switch ubuntu-latest to ubuntu-20.04 in main tests
 (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6
---
 azure-pipelines.yml | 103 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 azure-pipelines.yml

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 00000000000..0f7ea91f96f
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,103 @@
+trigger:
+  batch: true
+  branches:
+    include:
+      - "*"
+    exclude:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+  paths:
+    exclude:
+      - "website/*"
+      - "*.md"
+      - ".github/workflows/*"
+pr:
+  paths:
+    exclude:
+      - "*.md"
+      - "website/docs/*"
+      - "website/src/*"
+      - ".github/workflows/*"
+
+jobs:
+  # Perform basic checks for most important errors (syntax etc.) Uses the config
+  # defined in .flake8 and overwrites the selected codes.
+  - job: "Validate"
+    pool:
+      vmImage: "ubuntu-latest"
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "3.7"
+      - script: |
+          pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+        displayName: "flake8"
+
+  - job: "Test"
+    dependsOn: "Validate"
+    strategy:
+      matrix:
+        # We're only running one platform per Python version to speed up builds
+        Python36Linux:
+          imageName: "ubuntu-20.04"
+          python.version: "3.6"
+        #        Python36Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.6"
+        #        Python36Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.6"
+        #        Python37Linux:
+        #          imageName: "ubuntu-20.04"
+        #          python.version: "3.7"
+        Python37Windows:
+          imageName: "windows-latest"
+          python.version: "3.7"
+        #        Python37Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.7"
+        #        Python38Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.8"
+        #        Python38Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.8"
+        Python38Mac:
+          imageName: "macos-latest"
+          python.version: "3.8"
+        Python39Linux:
+          imageName: "ubuntu-latest"
+          python.version: "3.9"
+        #        Python39Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.9"
+        #        Python39Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.9"
+        #        Python310Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.10"
+        Python310Windows:
+          imageName: "windows-latest"
+          python.version: "3.10"
+        #        Python310Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11'
+      maxParallel: 4
+    pool:
+      vmImage: $(imageName)
+    steps:
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'

From 28df8e65f46d4abd1a2d651be0a4e2065a3d7146 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:43:52 +0900
Subject: [PATCH 137/504] Remove all references to "begin_training" (#11943)

When v3 was released, `begin_training` was renamed to `initialize`.
There were warnings in the code and docs about that. This PR removes
them.
---
 spacy/errors.py                           |  7 -------
 spacy/language.py                         |  9 ---------
 spacy/pipeline/pipe.pyx                   |  7 -------
 spacy/tests/pipeline/test_pipe_methods.py | 11 -----------
 website/docs/api/dependencyparser.mdx     |  6 ------
 website/docs/api/entitylinker.mdx         |  6 ------
 website/docs/api/entityrecognizer.mdx     |  6 ------
 website/docs/api/language.mdx             |  9 ---------
 website/docs/api/pipe.mdx                 |  6 ------
 website/docs/api/tagger.mdx               |  6 ------
 website/docs/api/textcategorizer.mdx      |  6 ------
 11 files changed, 79 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 965c92066bc..454e71f987c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -132,13 +132,6 @@ class Warnings(metaclass=ErrorsWithCodes):
             "and make it independent. For example, `replace_listeners = "
             "[\"model.tok2vec\"]` See the documentation for details: "
             "https://spacy.io/usage/training#config-components-listeners")
-    W088 = ("The pipeline component {name} implements a `begin_training` "
-            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
-            "has been renamed to `initialize`, so you likely want to rename the "
-            "component method. See the documentation for details: "
-            "https://spacy.io/api/language#initialize")
-    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
-            "to `nlp.initialize`.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
diff --git a/spacy/language.py b/spacy/language.py
index 18d20c93932..a47cc5df454 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1277,15 +1277,6 @@ def get_grads(key, W, dW):
             sgd(key, W, dW)  # type: ignore[call-arg, misc]
         return losses
 
-    def begin_training(
-        self,
-        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
-        *,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
-        warnings.warn(Warnings.W089, DeprecationWarning)
-        return self.initialize(get_examples, sgd=sgd)
-
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 72ea7e45a80..ea5fc5253d9 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -21,13 +21,6 @@ cdef class Pipe:
     DOCS: https://spacy.io/api/pipe
     """
 
-    @classmethod
-    def __init_subclass__(cls, **kwargs):
-        """Raise a warning if an inheriting class implements 'begin_training'
-         (from v2) instead of the new 'initialize' method (from v3)"""
-        if hasattr(cls, "begin_training"):
-            warnings.warn(Warnings.W088.format(name=cls.__name__))
-
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 4dd7bae16c2..9b9786f0458 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -529,17 +529,6 @@ def test_pipe_label_data_no_labels(pipe):
         assert "labels" not in get_arg_names(initialize)
 
 
-def test_warning_pipe_begin_training():
-    with pytest.warns(UserWarning, match="begin_training"):
-
-        class IncompatPipe(TrainablePipe):
-            def __init__(self):
-                ...
-
-            def begin_training(*args, **kwargs):
-                ...
-
-
 def test_pipe_methods_initialize():
     """Test that the [initialize] config reflects the components correctly."""
     nlp = Language()
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index a6bc48cdf74..771a00aeee1 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -169,12 +169,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 85b872151fd..238b62a2e6d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -200,12 +200,6 @@ knowledge base. This argument should be a function that takes a `Vocab` instance
 and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
 are synced with the current vocab.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index c80406a5b81..1f386bbb6ff 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -165,12 +165,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 068e8ea7885..d5fbae05ec4 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -259,15 +259,6 @@ either in the [config](/usage/training#config), or by calling
 [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
 the tagger or textcat).
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`. It now also takes a
-**function** that is called with no arguments and returns a sequence of
-[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
-objects.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index c2777edf07e..b387ea58654 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -152,12 +152,6 @@ network,
 setting up the label scheme based on the data. This method is typically called
 by [`Language.initialize`](/api/language#initialize).
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 20852e8eb94..ae14df212ee 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -142,12 +142,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a1dfb6dd88e..5db3a409255 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -187,12 +187,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python

From 1595eafef213287d6374af855d77ce19ecc4f882 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:45:52 +0900
Subject: [PATCH 138/504] Remove old model shortcuts (#11916)

* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/download.py         | 18 ++----------------
 spacy/errors.py               | 16 ----------------
 spacy/util.py                 |  4 +---
 website/UNIVERSE.md           |  2 +-
 website/docs/usage/models.mdx | 29 +----------------------------
 website/meta/universe.json    | 20 ++++++++++++--------
 6 files changed, 17 insertions(+), 72 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 4261fb830d9..f371d110319 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,16 +7,8 @@
 from wasabi import msg
 
 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
-    get_minor_version,
-    is_in_interactive,
-    is_in_jupyter,
-    is_package,
-    is_prerelease_version,
-    run_command,
-)
-from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
+from ..util import is_package, get_minor_version, run_command
+from ..util import is_prerelease_version
 
 
 @app.command(
@@ -76,12 +68,6 @@ def download(
         version = components[-1]
     else:
         model_name = model
-        if model in OLD_MODEL_SHORTCUTS:
-            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
-                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
-            )
-            model_name = OLD_MODEL_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
 
diff --git a/spacy/errors.py b/spacy/errors.py
index 454e71f987c..5f03d0eae94 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -732,13 +732,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "method in component '{name}'. If you want to use this "
             "method, make sure it's overwritten on the subclass.")
     E940 = ("Found NaN values in scores.")
-    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
-            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
-            "load the model, use its full name instead:\n\n"
-            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
-            "models, see the models directory: https://spacy.io/models and if "
-            "you want to create a blank model, use spacy.blank: "
-            "nlp = spacy.blank(\"{name}\")")
     E942 = ("Executing `after_{name}` callback failed. Expected the function to "
             "return an initialized nlp object but got: {value}. Maybe "
             "you forgot to return the modified object in your function?")
@@ -986,15 +979,6 @@ class Errors(metaclass=ErrorsWithCodes):
              "but got '{received_type}'")
 
 
-# Deprecated model shortcuts, only used in errors and warnings
-OLD_MODEL_SHORTCUTS = {
-    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
-    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
-    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
-    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
-}
-
-
 # fmt: on
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 8068c4bcec9..463ac219bf5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -66,7 +66,7 @@
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
+from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
 from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
@@ -465,8 +465,6 @@ def load_model(
             return load_model_from_path(Path(name), **kwargs)  # type: ignore[arg-type]
     elif hasattr(name, "exists"):  # Path or Path-like to model data
         return load_model_from_path(name, **kwargs)  # type: ignore[arg-type]
-    if name in OLD_MODEL_SHORTCUTS:
-        raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))  # type: ignore[index]
     raise IOError(Errors.E050.format(name=name))
 
 
diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md
index ac4e2e684fb..a9008086c95 100644
--- a/website/UNIVERSE.md
+++ b/website/UNIVERSE.md
@@ -61,7 +61,7 @@ use a linter to verify that your markup is correct.
         "import spacy",
         "import package_name",
         "",
-        "nlp = spacy.load('en')",
+        "nlp = spacy.load('en_core_web_sm')",
         "nlp.add_pipe(package_name)"
     ],
     "code_language": "python",
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 9213dead16b..e74c37e3080 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -337,23 +337,7 @@ The easiest way to download a trained pipeline is via spaCy's
 [`download`](/api/cli#download) command. It takes care of finding the
 best-matching package compatible with your spaCy installation.
 
-> #### Important note for v3.0
->
-> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed pipeline package, use its full name:
->
-> ```diff
-> - python -m spacy download en
-> + python -m spacy download en_core_web_sm
-> ```
->
-> ```diff
-> - nlp = spacy.load("en")
-> + nlp = spacy.load("en_core_web_sm")
-> ```
-
-```bash
+```cli
 # Download best-matching version of a package for your spaCy installation
 $ python -m spacy download en_core_web_sm
 
@@ -483,17 +467,6 @@ spacy.cli.download("en_core_web_sm")
 To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with
 the package name or a path to the data directory:
 
-> #### Important note for v3.0
->
-> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed pipeline package, use its full name:
->
-> ```diff
-> - python -m spacy download en
-> + python -m spacy download en_core_web_sm
-> ```
-
 ```python
 import spacy
 nlp = spacy.load("en_core_web_sm")           # load package "en_core_web_sm"
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 6278dd4899b..cb2386e1fb8 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1181,12 +1181,13 @@
             "author_links": {
                 "github": "mholtzscher"
             },
-            "category": ["pipeline"]
+            "category": ["pipeline"],
+            "spacy_version": 2
         },
         {
             "id": "spacy_cld",
             "title": "spaCy-CLD",
-            "slogan": "Add language detection to your spaCy pipeline using CLD2",
+            "slogan": "Add language detection to your spaCy v2 pipeline using CLD2",
             "description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score (reported in with each language.",
             "github": "nickdavidhaynes/spacy-cld",
             "pip": "spacy_cld",
@@ -1206,7 +1207,8 @@
             "author_links": {
                 "github": "nickdavidhaynes"
             },
-            "category": ["pipeline"]
+            "category": ["pipeline"],
+            "spacy_version": 2
         },
         {
             "id": "spacy-iwnlp",
@@ -1280,7 +1282,8 @@
                 "github": "sammous"
             },
             "category": ["pipeline"],
-            "tags": ["pos", "lemmatizer", "french"]
+            "tags": ["pos", "lemmatizer", "french"],
+            "spacy_version": 2
         },
         {
             "id": "lemmy",
@@ -1474,8 +1477,8 @@
         },
         {
             "id": "neuralcoref",
-            "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy",
-            "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.",
+            "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy v2",
+            "description": "This coreference resolution module is based on the super fast spaCy parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source, and v3+ is not supported.",
             "github": "huggingface/neuralcoref",
             "thumb": "https://i.imgur.com/j6FO9O6.jpg",
             "code_example": [
@@ -1496,7 +1499,8 @@
                 "github": "huggingface"
             },
             "category": ["standalone", "conversational", "models"],
-            "tags": ["coref"]
+            "tags": ["coref"],
+            "spacy_version": 2
         },
         {
             "id": "neuralcoref-vizualizer",
@@ -1572,7 +1576,7 @@
                 "import spacy",
                 "import explacy",
                 "",
-                "nlp = spacy.load('en')",
+                "nlp = spacy.load('en_core_web_sm')",
                 "explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')"
             ],
             "author": "Tyler Neylon",

From 19d902b6f20c7012a7fbbc5190abc55fb08f45b3 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 8 Dec 2022 13:24:45 +0100
Subject: [PATCH 139/504] Remove unused, experimental multi-task components
 (#11919)

* Remove experimental multi-task components

These are incomplete implementations and are not usable in their current state.

* Remove orphaned error message

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Revert "Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)"

This reverts commit 77c0fd7b176be80e8438fa21440a85d1fe26e39b.

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 setup.py                     |   1 -
 spacy/errors.py              |   2 -
 spacy/pipeline/multitask.pyx | 215 -----------------------------------
 3 files changed, 218 deletions(-)
 delete mode 100644 spacy/pipeline/multitask.pyx

diff --git a/setup.py b/setup.py
index c9b4f7171e3..a80016ea9ea 100755
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,6 @@
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
diff --git a/spacy/errors.py b/spacy/errors.py
index 5f03d0eae94..11b8980fd9d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -249,8 +249,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E016 = ("MultitaskObjective target should be function or one of: dep, "
-            "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
deleted file mode 100644
index f33a90fde85..00000000000
--- a/spacy/pipeline/multitask.pyx
+++ /dev/null
@@ -1,215 +0,0 @@
-# cython: infer_types=True, binding=True
-from typing import Optional
-
-import numpy
-from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
-
-from ..attrs import ID
-from ..errors import Errors
-from ..language import Language
-from ..training import validate_examples
-from .tagger import Tagger
-from .trainable_pipe import TrainablePipe
-
-default_model_config = """
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v2"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-"""
-DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
-
-
-@Language.factory(
-    "nn_labeller",
-    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
-)
-def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
-    return MultitaskObjective(nlp.vocab, model, name)
-
-
-class MultitaskObjective(Tagger):
-    """Experimental: Assist training of a parser or tagger, by training a
-    side-objective.
-    """
-
-    def __init__(self, vocab, model, name="nn_labeller", *, target):
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        if target == "dep":
-            self.make_label = self.make_dep
-        elif target == "tag":
-            self.make_label = self.make_tag
-        elif target == "ent":
-            self.make_label = self.make_ent
-        elif target == "dep_tag_offset":
-            self.make_label = self.make_dep_tag_offset
-        elif target == "ent_tag":
-            self.make_label = self.make_ent_tag
-        elif target == "sent_start":
-            self.make_label = self.make_sent_start
-        elif hasattr(target, "__call__"):
-            self.make_label = target
-        else:
-            raise ValueError(Errors.E016)
-        cfg = {"labels": {}, "target": target}
-        self.cfg = dict(cfg)
-
-    @property
-    def labels(self):
-        return self.cfg.setdefault("labels", {})
-
-    @labels.setter
-    def labels(self, value):
-        self.cfg["labels"] = value
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None, labels=None):
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
-            raise ValueError(err)
-        if labels is not None:
-            self.labels = labels
-        else:
-            for example in get_examples():
-                for token in example.y:
-                    label = self.make_label(token)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
-        self.model.initialize()   # TODO: fix initialization by defining X and Y
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        scores = self.model.get_ref("softmax")(tokvecs)
-        return tokvecs, scores
-
-    def get_loss(self, examples, scores):
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        for i, eg in enumerate(examples):
-            # Handles alignment for tokenization differences
-            _doc_annots = eg.get_aligned()  # TODO
-            for j in range(len(eg.predicted)):
-                tok_annots = {key: values[j] for key, values in tok_annots.items()}
-                label = self.make_label(j, tok_annots)
-                if label is None or label not in self.labels:
-                    correct[idx] = guesses[idx]
-                else:
-                    correct[idx] = self.labels[label]
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        loss = (d_scores**2).sum()
-        return float(loss), d_scores
-
-    @staticmethod
-    def make_dep(token):
-        return token.dep_
-
-    @staticmethod
-    def make_tag(token):
-        return token.tag_
-
-    @staticmethod
-    def make_ent(token):
-        if token.ent_iob_ == "O":
-            return "O"
-        else:
-            return token.ent_iob_ + "-" + token.ent_type_
-
-    @staticmethod
-    def make_dep_tag_offset(token):
-        dep = token.dep_
-        tag = token.tag_
-        offset = token.head.i - token.i
-        offset = min(offset, 2)
-        offset = max(offset, -2)
-        return f"{dep}-{tag}:{offset}"
-
-    @staticmethod
-    def make_ent_tag(token):
-        if token.ent_iob_ == "O":
-            ent = "O"
-        else:
-            ent = token.ent_iob_ + "-" + token.ent_type_
-        tag = token.tag_
-        return f"{tag}-{ent}"
-
-    @staticmethod
-    def make_sent_start(token):
-        """A multi-task objective for representing sentence boundaries,
-        using BILU scheme. (O is impossible)
-        """
-        if token.is_sent_start and token.is_sent_end:
-            return "U-SENT"
-        elif token.is_sent_start:
-            return "B-SENT"
-        else:
-            return "I-SENT"
-
-
-class ClozeMultitask(TrainablePipe):
-    def __init__(self, vocab, model, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self.cfg = cfg
-        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None):
-        self.model.initialize()  # TODO: fix initialization by defining X and Y
-        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.initialize(X)
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        vectors = self.model.get_ref("output_layer")(tokvecs)
-        return tokvecs, vectors
-
-    def get_loss(self, examples, vectors, prediction):
-        validate_examples(examples, "ClozeMultitask.get_loss")
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
-        target = vectors[ids]
-        gradient = self.distance.get_grad(prediction, target)
-        loss = self.distance.get_loss(prediction, target)
-        return float(loss), gradient
-
-    def update(self, examples, *, drop=0., sgd=None, losses=None):
-        pass
-
-    def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
-        set_dropout_rate(self.model, drop)
-        validate_examples(examples, "ClozeMultitask.rehearse")
-        predictions, bp_predictions = self.model.begin_update()
-        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
-        bp_predictions(d_predictions)
-        if sgd is not None:
-            self.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
-        return losses
-
-    def add_label(self, label):
-        raise NotImplementedError

From 637799f5fe57c4980cb4b9f6c25ae5b50b83bc52 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 12 Dec 2022 08:55:53 +0100
Subject: [PATCH 140/504] Custom extensions for spans with equal boundaries
 (#11429)

* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/doc/test_underscore.py | 119 +++++++++++++++++++++++++++++
 spacy/tokens/doc.pyx               |  30 ++++++--
 spacy/tokens/span.pyx              |  38 +++++++--
 spacy/tokens/underscore.py         |  44 ++++++++++-
 4 files changed, 214 insertions(+), 17 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index b79d2f01f41..ca5c2ad3959 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+
 
 @pytest.fixture(scope="function", autouse=True)
 def clean_underscore():
@@ -172,3 +176,118 @@ def test_method(doc, arg1=1, arg2=2):
     doc = Doc(en_vocab, words=["hello", "world"])
     assert test_method.__doc__ == "I am a docstring"
     assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
+
+
+def test_underscore_for_unique_span(en_tokenizer):
+    """Test that spans with the same boundaries but with different labels are uniquely identified (see #9706)."""
+    Doc.set_extension(name="doc_extension", default=None)
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    doc._.doc_extension = "doc extension"
+    doc[0]._.token_extension = "token extension"
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    # Assert extensions
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change label of span and assert extensions
+    span_1.label_ = "NEW_LABEL"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change KB_ID and assert extensions
+    span_1.kb_id_ = "KB_ID"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change extensions and assert
+    span_2._.span_extension = "updated span_2 extension"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Change span ID and assert extensions
+    span_2.id = 2
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Assert extensions with original key
+    assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
+    assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
+
+
+def test_underscore_for_unique_span_from_docs(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text_1 = "Hello, world!"
+    doc_1 = en_tokenizer(text_1)
+    span_1a = Span(doc_1, 0, 2, "SPAN_1a")
+    span_1b = Span(doc_1, 0, 2, "SPAN_1b")
+
+    text_2 = "This is a test."
+    doc_2 = en_tokenizer(text_2)
+    span_2a = Span(doc_2, 0, 3, "SPAN_2a")
+
+    # Set custom extensions
+    doc_1[0]._.token_extension = "token_1"
+    doc_2[1]._.token_extension = "token_2"
+    span_1a._.span_extension = "span_1a extension"
+    span_1b._.span_extension = "span_1b extension"
+    span_2a._.span_extension = "span_2a extension"
+
+    doc = Doc.from_docs([doc_1, doc_2])
+    # Assert extensions
+    assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
+
+    # Check extensions on merged doc
+    assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert (
+        doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_2a.start_char + len(doc_1.text) + 1,
+                span_2a.end_char + len(doc_1.text) + 1,
+                span_2a.label,
+                span_2a.kb_id,
+                span_2a.id,
+            )
+        ]
+        == "span_2a extension"
+    )
+
+
+def test_underscore_for_unique_span_as_span(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
+    Span.set_extension(name="span_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    span_doc = span_1.as_doc(copy_user_data=True)
+
+    # Assert extensions
+    assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 48def8c9544..09dc94297f0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1222,13 +1222,22 @@ cdef class Doc:
 
             if "user_data" not in exclude:
                 for key, value in doc.user_data.items():
-                    if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                        data_type, name, start, end = key
+                    if isinstance(key, tuple) and len(key) >= 4 and key[0] == "._.":
+                        data_type = key[0]
+                        name = key[1]
+                        start = key[2]
+                        end = key[3]
                         if start is not None or end is not None:
                             start += char_offset
                             if end is not None:
                                 end += char_offset
-                            concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+                                _label = key[4]
+                                _kb_id = key[5]
+                                _span_id = key[6]
+                                concat_user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
+                            else:
+                                concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+
                         else:
                             warnings.warn(Warnings.W101.format(name=name))
                     else:
@@ -1672,7 +1681,11 @@ cdef class Doc:
                 Span.set_extension(span_attr)
             for span_data in doc_json["underscore_span"][span_attr]:
                 value = span_data["value"]
-                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
+                span = self.char_span(span_data["start"], span_data["end"])
+                span.label = span_data["label"]
+                span.kb_id = span_data["kb_id"]
+                span.id = span_data["id"]
+                span._.set(span_attr, value)
         return self
 
     def to_json(self, underscore=None):
@@ -1750,13 +1763,16 @@ cdef class Doc:
                                 if attr not in data["underscore_token"]:
                                     data["underscore_token"][attr] = []
                                 data["underscore_token"][attr].append({"start": start, "value": value})
-                            # Span attribute
-                            elif start is not None and end is not None:
+                            # Else span attribute
+                            elif end is not None:
+                                _label = data_key[4]
+                                _kb_id = data_key[5]
+                                _span_id = data_key[6]
                                 if "underscore_span" not in data:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 73f555747eb..bf37f955d98 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -230,11 +230,10 @@ cdef class Span:
         cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char)
+                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
-
         copy_user_data (bool): Whether or not to copy the original doc's user data.
         array_head (tuple): `Doc` array attrs, can be passed in to speed up computation.
         array (ndarray): `Doc` as array, can be passed in to speed up computation.
@@ -287,12 +286,22 @@ cdef class Span:
             char_offset = self.start_char
             for key, value in self.doc.user_data.items():
                 if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                    data_type, name, start, end = key
+                    data_type = key[0]
+                    name = key[1]
+                    start = key[2]
+                    end = key[3]
                     if start is not None or end is not None:
                         start -= char_offset
+                        # Check if Span object
                         if end is not None:
                             end -= char_offset
-                        user_data[(data_type, name, start, end)] = copy.copy(value)
+                            _label = key[4]
+                            _kb_id = key[5]
+                            _span_id = key[6]
+                            user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
+                        # Else Token object
+                        else:
+                            user_data[(data_type, name, start, end)] = copy.copy(value)
                 else:
                     user_data[key] = copy.copy(value)
             doc.user_data = user_data
@@ -815,21 +824,36 @@ cdef class Span:
             return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.span_c().label = label
+            if label != self.span_c().label :
+                old_label = self.span_c().label
+                self.span_c().label = label
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.span_c().kb_id = kb_id
+            if kb_id != self.span_c().kb_id :
+                old_kb_id = self.span_c().kb_id
+                self.span_c().kb_id = kb_id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property id:
         def __get__(self):
             return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.span_c().id = id
+            if id != self.span_c().id :
+                old_id = self.span_c().id
+                self.span_c().id = id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
+                Underscore._replace_keys(old, new)
 
     property ent_id:
         """Alias for the span's ID."""
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 0aa0c1e6d40..63706851286 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
+from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
-    from .span import Span
     from .token import Token
 
 
@@ -26,6 +26,9 @@ def __init__(
         obj: Union["Doc", "Span", "Token"],
         start: Optional[int] = None,
         end: Optional[int] = None,
+        label: int = 0,
+        kb_id: int = 0,
+        span_id: int = 0,
     ):
         object.__setattr__(self, "_extensions", extensions)
         object.__setattr__(self, "_obj", obj)
@@ -37,6 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
+        if type(obj) == Span:
+            object.__setattr__(self, "_label", label)
+            object.__setattr__(self, "_kb_id", kb_id)
+            object.__setattr__(self, "_span_id", span_id)
 
     def __dir__(self) -> List[str]:
         # Hack to enable autocomplete on custom extensions
@@ -89,8 +96,39 @@ def get(self, name: str) -> Any:
     def has(self, name: str) -> bool:
         return name in self._extensions
 
-    def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
-        return ("._.", name, self._start, self._end)
+    def _get_key(
+        self, name: str
+    ) -> Union[
+        Tuple[str, str, Optional[int], Optional[int]],
+        Tuple[str, str, Optional[int], Optional[int], int, int, int],
+    ]:
+        if hasattr(self, "_label"):
+            return (
+                "._.",
+                name,
+                self._start,
+                self._end,
+                self._label,
+                self._kb_id,
+                self._span_id,
+            )
+        else:
+            return "._.", name, self._start, self._end
+
+    @staticmethod
+    def _replace_keys(old_underscore: "Underscore", new_underscore: "Underscore"):
+        """
+        This function is called by Span when its kb_id or label are re-assigned.
+        It checks if any user_data is stored for this span and replaces the keys
+        """
+        for name in old_underscore._extensions:
+            old_key = old_underscore._get_key(name)
+            old_doc = old_underscore._doc
+            new_key = new_underscore._get_key(name)
+            if old_key != new_key and old_key in old_doc.user_data:
+                old_underscore._doc.user_data[
+                    new_key
+                ] = old_underscore._doc.user_data.pop(old_key)
 
     @classmethod
     def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:

From d127dceec743c1e0646d99ebeacf3244c8ae80d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Sat, 17 Dec 2022 14:32:19 +0100
Subject: [PATCH 141/504] Fix v4 branch to build against Thinc v9 (#11921)

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 pyproject.toml                                |   5 +-
 requirements.txt                              |   2 +-
 setup.cfg                                     |   4 +-
 setup.py                                      |   2 +
 spacy/ml/parser_model.pyx                     |  26 +-
 .../_parser_internals/_beam_utils.pxd         |   3 +-
 .../_parser_internals/_beam_utils.pyx         |  12 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +-
 spacy/pipeline/_parser_internals/ner.pyx      |   4 +-
 spacy/pipeline/_parser_internals/search.pxd   |  89 +++++
 spacy/pipeline/_parser_internals/search.pyx   | 306 ++++++++++++++++++
 spacy/pipeline/edit_tree_lemmatizer.py        |   7 +-
 spacy/pipeline/morphologizer.pyx              |   5 +-
 spacy/pipeline/senter.pyx                     |   6 +-
 spacy/pipeline/tagger.pyx                     |   7 +-
 spacy/pipeline/transition_parser.pyx          |  21 +-
 spacy/tests/conftest.py                       |  32 ++
 spacy/tests/parser/_search.pyx                | 119 +++++++
 spacy/tests/parser/test_search.py             |   3 +
 19 files changed, 606 insertions(+), 50 deletions(-)
 create mode 100644 spacy/pipeline/_parser_internals/search.pxd
 create mode 100644 spacy/pipeline/_parser_internals/search.pyx
 create mode 100644 spacy/tests/parser/_search.pyx
 create mode 100644 spacy/tests/parser/test_search.py

diff --git a/pyproject.toml b/pyproject.toml
index bfd7e68d1f7..77e1471426f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,8 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
-    "numpy>=1.15.0; python_version < '3.9'",
-    "numpy>=1.25.0; python_version >= '3.9'",
+    "thinc>=9.0.0.dev0,<9.1.0",
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 0ad05c62944..874f0826613 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 935ac7d0ee4..5b5c330deb5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
-    wasabi>=0.9.1,<1.2.0
+    thinc>=9.0.0.dev0,<9.1.0
+    wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.4.0
diff --git a/setup.py b/setup.py
index a80016ea9ea..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@
     "spacy.pipeline._parser_internals.arc_eager",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals.search",
     "spacy.pipeline._parser_internals._state",
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
@@ -66,6 +67,7 @@
     "spacy.matcher.dependencymatcher",
     "spacy.symbols",
     "spacy.vectors",
+    "spacy.tests.parser._search",
 ]
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index cb323e98891..10a9f0bc485 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -3,7 +3,6 @@
 cimport numpy as np
 from libc.math cimport exp
 from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 from thinc.backends.linalg cimport Vec, VecVec
 
@@ -116,14 +115,10 @@ cdef void predict_states(
         n.hiddens * n.pieces
     )
     for i in range(n.states):
-        VecVec.add_i(
-            &A.unmaxed[i*n.hiddens*n.pieces],
-            W.feat_bias, 1.,
-            n.hiddens * n.pieces
-        )
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
-            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
+            which = _arg_max(&A.unmaxed[index], n.pieces)
             A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
     memset(A.scores, 0, n.states * n.classes * sizeof(float))
     if W.hidden_weights == NULL:
@@ -138,7 +133,7 @@ cdef void predict_states(
         )
         # Add bias
         for i in range(n.states):
-            VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
     # Set unseen classes to minimum value
     i = 0
     min_ = A.scores[0]
@@ -187,7 +182,8 @@ cdef void cpu_log_loss(
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = Vec.arg_max(scores, O)
+    guess = _arg_max(scores, O)
+
     if best == -1 or guess == -1:
         # These shouldn't happen, but if they do, we want to make sure we don't
         # cause an OOB access.
@@ -529,3 +525,15 @@ cdef class precompute_hiddens:
             return d_best.reshape((d_best.shape + (1,)))
 
         return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 596306b2319..571f246b1e3 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,7 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
-
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
 cdef int check_final_state(void* _state, void* extra_args) except -1
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index ac04be5a719..d004d313c3e 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,21 +1,17 @@
 # cython: infer_types=True
 import numpy
-
-from thinc.extra.search cimport Beam
-
-from thinc.extra.search import MaxViolation
-
-from thinc.extra.search cimport MaxViolation
+from cpython.ref cimport PyObject, Py_XDECREF
 
 from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
-
+from .search cimport Beam, MaxViolation
+from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
 
 
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
     dest = <StateC*>_dest
     src = <StateC*>_src
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index e1375494482..10f2649baa0 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -18,8 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
-
-from thinc.extra.search cimport Beam
+from .search cimport Beam
 
 
 cdef weight_t MIN_SCORE = -90000
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index c77b7b50f2d..6851f9f2096 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -8,8 +8,6 @@ from libc.stdint cimport int32_t
 
 from collections import Counter
 
-from thinc.extra.search cimport Beam
-
 from ...tokens.doc cimport Doc
 
 from ...tokens.span import Span
@@ -23,6 +21,8 @@ from ...typedefs cimport attr_t, weight_t
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
+from .search cimport Beam
+from .stateclass cimport StateClass
 from ._state cimport StateC
 from .stateclass cimport StateClass
 from .transition_system cimport Transition, do_func_t
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
new file mode 100644
index 00000000000..dfe30e1c130
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -0,0 +1,89 @@
+from cymem.cymem cimport Pool
+
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
+from libcpp.pair cimport pair
+from libcpp.queue cimport priority_queue
+from libcpp.vector cimport vector
+
+from ...typedefs cimport class_t, weight_t, hash_t
+
+ctypedef pair[weight_t, size_t] Entry
+ctypedef priority_queue[Entry] Queue
+
+
+ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
+
+ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
+
+ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
+
+ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
+
+ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
+
+
+cdef struct _State:
+    void* content
+    class_t* hist
+    weight_t score
+    weight_t loss
+    int i
+    int t
+    bint is_done
+
+
+cdef class Beam:
+    cdef Pool mem
+    cdef class_t nr_class
+    cdef class_t width
+    cdef class_t size
+    cdef public weight_t min_density
+    cdef int t
+    cdef readonly bint is_done
+    cdef list histories
+    cdef list _parent_histories
+    cdef weight_t** scores
+    cdef int** is_valid
+    cdef weight_t** costs
+    cdef _State* _parents
+    cdef _State* _states
+    cdef del_func_t del_func
+
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
+
+    cdef inline void* at(self, int i) nogil:
+        return self._states[i].content
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
+ 
+
+    cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
+        self.scores[i][j] = score
+        self.is_valid[i][j] = is_valid
+        self.costs[i][j] = cost
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
+
+
+cdef class MaxViolation:
+    cdef Pool mem
+    cdef weight_t cost
+    cdef weight_t delta
+    cdef readonly weight_t p_score
+    cdef readonly weight_t g_score
+    cdef readonly double Z
+    cdef readonly double gZ
+    cdef class_t n
+    cdef readonly list p_hist
+    cdef readonly list g_hist
+    cdef readonly list p_probs
+    cdef readonly list g_probs
+
+    cpdef int check(self, Beam pred, Beam gold) except -1
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
new file mode 100644
index 00000000000..1d9b6dd7adf
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -0,0 +1,306 @@
+# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+cimport cython
+from libc.string cimport memset, memcpy
+from libc.math cimport log, exp
+import math
+
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
+
+cdef class Beam:
+    def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
+        assert nr_class != 0
+        assert width != 0
+        self.nr_class = nr_class
+        self.width = width
+        self.min_density = min_density
+        self.size = 1
+        self.t = 0
+        self.mem = Pool()
+        self.del_func = NULL
+        self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        cdef int i
+        self.histories = [[] for i in range(self.width)]
+        self._parent_histories = [[] for i in range(self.width)]
+
+        self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        for i in range(self.width):
+            self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+            self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
+            self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+
+    def __len__(self):
+        return self.size
+
+    property score:
+        def __get__(self):
+            return self._states[0].score
+
+    property min_score:
+        def __get__(self):
+            return self._states[self.size-1].score
+
+    property loss:
+        def __get__(self):
+            return self._states[0].loss
+
+    property probs:
+        def __get__(self):
+            return _softmax([self._states[i].score for i in range(self.size)])
+
+    property scores:
+        def __get__(self):
+            return [self._states[i].score for i in range(self.size)]
+
+    property histories:
+        def __get__(self):
+            return self.histories
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1:
+        cdef int j
+        for j in range(self.nr_class):
+            self.scores[i][j] = scores[j]
+            self.is_valid[i][j] = is_valid[j]
+            self.costs[i][j] = costs[j]
+
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
+        cdef int i, j
+        for i in range(self.width):
+            memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
+            memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
+            memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
+        for i in range(self.width):
+            self._states[i].content = init_func(self.mem, n, extra_args)
+            self._parents[i].content = init_func(self.mem, n, extra_args)
+        self.del_func = del_func
+
+    def __dealloc__(self):
+        if self.del_func == NULL:
+            return
+
+        for i in range(self.width):
+            self.del_func(self.mem, self._states[i].content, NULL)
+            self.del_func(self.mem, self._parents[i].content, NULL)
+
+    @cython.cdivision(True)
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1:
+        cdef weight_t** scores = self.scores
+        cdef int** is_valid = self.is_valid
+        cdef weight_t** costs = self.costs
+
+        cdef Queue* q = new Queue()
+        self._fill(q, scores, is_valid)
+        # For a beam of width k, we only ever need 2k state objects. How?
+        # Each transition takes a parent and a class and produces a new state.
+        # So, we don't need the whole history --- just the parent. So at
+        # each step, we take a parent, and apply one or more extensions to
+        # it.
+        self._parents, self._states = self._states, self._parents
+        self._parent_histories, self.histories = self.histories, self._parent_histories
+        cdef weight_t score
+        cdef int p_i
+        cdef int i = 0
+        cdef class_t clas
+        cdef _State* parent
+        cdef _State* state
+        cdef hash_t key
+        cdef PreshMap seen_states = PreshMap(self.width)
+        cdef uint64_t is_seen
+        cdef uint64_t one = 1
+        while i < self.width and not q.empty():
+            data = q.top()
+            p_i = data.second / self.nr_class
+            clas = data.second % self.nr_class
+            score = data.first
+            q.pop()
+            parent = &self._parents[p_i]
+            # Indicates terminal state reached; i.e. state is done
+            if parent.is_done:
+                # Now parent will not be changed, so we don't have to copy.
+                # Once finished, should also be unbranching.
+                self._states[i], parent[0] = parent[0], self._states[i]
+                parent.i = self._states[i].i
+                parent.t = self._states[i].t
+                parent.is_done = self._states[i].t
+                self._states[i].score = score
+                self.histories[i] = list(self._parent_histories[p_i])
+                i += 1
+            else:
+                state = &self._states[i]
+                # The supplied transition function should adjust the destination
+                # state to be the result of applying the class to the source state
+                transition_func(state.content, parent.content, clas, extra_args)
+                key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
+                is_seen = <uint64_t>seen_states.get(key)
+                if key == 0 or key == 1 or not is_seen:
+                    if key != 0 and key != 1:
+                        seen_states.set(key, <void*>one)
+                    state.score = score
+                    state.loss = parent.loss + costs[p_i][clas]
+                    self.histories[i] = list(self._parent_histories[p_i])
+                    self.histories[i].append(clas)
+                    i += 1
+        del q
+        self.size = i
+        assert self.size >= 1
+        for i in range(self.width):
+            memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
+        self.t += 1
+
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
+        cdef int i
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self._states[i].is_done = finish_func(self._states[i].content, extra_args)
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self.is_done = False
+                break
+        else:
+            self.is_done = True
+
+    @cython.cdivision(True)
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
+        """Populate the queue from a k * n matrix of scores, where k is the
+        beam-width, and n is the number of classes.
+        """
+        cdef Entry entry
+        cdef weight_t score
+        cdef _State* s
+        cdef int i, j, move_id
+        assert self.size >= 1
+        cdef vector[Entry] entries
+        for i in range(self.size):
+            s = &self._states[i]
+            move_id = i * self.nr_class
+            if s.is_done:
+                # Update score by path average, following TACL '13 paper.
+                if self.histories[i]:
+                    entry.first = s.score + (s.score / self.t)
+                else:
+                    entry.first = s.score
+                entry.second = move_id
+                entries.push_back(entry)
+            else:
+                for j in range(self.nr_class):
+                    if is_valid[i][j]:
+                        entry.first = s.score + scores[i][j]
+                        entry.second = move_id + j
+                        entries.push_back(entry)
+        cdef double max_, Z, cutoff
+        if self.min_density == 0.0:
+            for i in range(entries.size()):
+                q.push(entries[i])
+        elif not entries.empty():
+            max_ = entries[0].first
+            Z = 0.
+            cutoff = 0.
+            # Softmax into probabilities, so we can prune
+            for i in range(entries.size()):
+                if entries[i].first > max_:
+                    max_ = entries[i].first
+            for i in range(entries.size()):
+                Z += exp(entries[i].first-max_)
+            cutoff = (1. / Z) * self.min_density
+            for i in range(entries.size()):
+                prob = exp(entries[i].first-max_) / Z
+                if prob >= cutoff:
+                    q.push(entries[i])
+
+
+cdef class MaxViolation:
+    def __init__(self):
+        self.p_score = 0.0
+        self.g_score = 0.0
+        self.Z = 0.0
+        self.gZ = 0.0
+        self.delta = -1
+        self.cost = 0
+        self.p_hist = []
+        self.g_hist = []
+        self.p_probs = []
+        self.g_probs = []
+
+    cpdef int check(self, Beam pred, Beam gold) except -1:
+        cdef _State* p = &pred._states[0]
+        cdef _State* g = &gold._states[0]
+        cdef weight_t d = p.score - g.score
+        if p.loss >= 1 and (self.cost == 0 or d > self.delta):
+            self.cost = p.loss
+            self.delta = d
+            self.p_hist = list(pred.histories[0])
+            self.g_hist = list(gold.histories[0])
+            self.p_score = p.score
+            self.g_score = g.score
+            self.Z = 1e-10
+            self.gZ = 1e-10
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    self.Z += exp(pred._states[i].score)
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    prob = exp(gold._states[i].score)
+                    self.Z += prob
+                    self.gZ += prob
+
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1:
+        d = pred.score - gold.score
+        seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
+        if pred.loss > 0 and (self.cost == 0 or d > self.delta):
+            p_hist = []
+            p_scores = []
+            g_hist = []
+            g_scores = []
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    p_scores.append(pred._states[i].score)
+                    p_hist.append(list(pred.histories[i]))
+                # This can happen from non-monotonic actions
+                # If we find a better gold analysis this way, be sure to keep it.
+                elif pred._states[i].loss <= 0 \
+                and tuple(pred.histories[i]) not in seen_golds:
+                    g_scores.append(pred._states[i].score)
+                    g_hist.append(list(pred.histories[i]))
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    g_scores.append(gold._states[i].score)
+                    g_hist.append(list(gold.histories[i]))
+
+            all_probs = _softmax(p_scores + g_scores)
+            p_probs = all_probs[:len(p_scores)]
+            g_probs_all = all_probs[len(p_scores):]
+            g_probs = _softmax(g_scores)
+
+            self.cost = pred.loss
+            self.delta = d
+            self.p_hist = p_hist
+            self.g_hist = g_hist
+            # TODO: These variables are misnamed! These are the gradients of the loss.
+            self.p_probs = p_probs
+            # Intuition here:
+            # The gradient of the loss is:
+            # P(model) - P(truth)
+            # Normally, P(truth) is 1 for the gold
+            # But, if we want to do the "partial credit" scheme, we want
+            # to create a distribution over the gold, proportional to the scores
+            # awarded.
+            self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
+
+
+def _softmax(nums):
+    if not nums:
+        return []
+    max_ = max(nums)
+    nums = [(exp(n-max_) if n is not None else None) for n in nums]
+    Z = sum(n for n in nums if n is not None)
+    return [(n/Z if n is not None else None) for n in nums]
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 2ef639cad52..f9a8ae10561 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,9 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model
 from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 
 from .. import util
 from ..errors import Errors
@@ -131,7 +132,9 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
+        loss_func = LegacySequenceCategoricalCrossentropy(
+            normalize=False, missing_value=-1
+        )
 
         truths = []
         for eg in examples:
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index cc8f87936b9..d3068bdffdd 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import srsly
-from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
@@ -302,7 +303,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 521afe1d181..185430c122c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -4,7 +4,9 @@ from itertools import islice
 from typing import Callable, Optional
 
 import srsly
-from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
@@ -163,7 +165,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 8ecd0c46ee0..f25ee00407b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -2,7 +2,8 @@
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import numpy
 import srsly
-from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, set_dropout_rate, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
@@ -242,7 +243,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = SequenceCategoricalCrossentropy()
+        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -273,7 +274,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
+        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index b8ebbf8ca88..d310df92151 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,23 +13,20 @@ from libcpp.vector cimport vector
 
 import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+import numpy.random
 import numpy
 import numpy.random
 import srsly
 from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ..ml.parser_model cimport (
-    ActivationsC,
-    SizesC,
-    WeightsC,
-    alloc_activations,
-    arg_max_if_valid,
-    cpu_log_loss,
-    free_activations,
-    get_c_sizes,
-    get_c_weights,
-    predict_states,
-)
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
 from ._parser_internals.stateclass cimport StateClass
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 2a9f441c9b0..6085b89cf02 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,5 +1,10 @@
 import pytest
+from spacy.util import get_lang_class
+import functools
 from hypothesis import settings
+import inspect
+import importlib
+import sys
 
 from spacy.util import get_lang_class
 
@@ -48,6 +53,33 @@ def getopt(opt):
             pytest.skip("not referencing any issues")
 
 
+# Decorator for Cython-built tests
+# https://shwina.github.io/cython-testing/
+def cytest(func):
+    """
+    Wraps `func` in a plain Python function.
+    """
+
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        bound = inspect.signature(func).bind(*args, **kwargs)
+        return func(*bound.args, **bound.kwargs)
+
+    return wrapped
+
+
+def register_cython_tests(cython_mod_name: str, test_mod_name: str):
+    """
+    Registers all callables with name `test_*` in Cython module `cython_mod_name`
+    as attributes in module `test_mod_name`, making them discoverable by pytest.
+    """
+    cython_mod = importlib.import_module(cython_mod_name)
+    for name in dir(cython_mod):
+        item = getattr(cython_mod, name)
+        if callable(item) and name.startswith("test_"):
+            setattr(sys.modules[test_mod_name], name, item)
+
+
 # Fixtures for language tokenizers (languages sorted alphabetically)
 
 
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
new file mode 100644
index 00000000000..23fc8164412
--- /dev/null
+++ b/spacy/tests/parser/_search.pyx
@@ -0,0 +1,119 @@
+# cython: infer_types=True, binding=True
+from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
+from spacy.typedefs cimport class_t, weight_t
+from cymem.cymem cimport Pool
+
+from ..conftest import cytest
+import pytest
+
+cdef struct TestState:
+    int length
+    int x
+    Py_UNICODE* string
+
+
+cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
+    dest_state = <TestState*>dest
+    src_state = <TestState*>src
+    dest_state.length = src_state.length
+    dest_state.x = src_state.x
+    dest_state.x += clas
+    if extra_args != NULL:
+        dest_state.string = <Py_UNICODE*>extra_args
+    else:
+        dest_state.string = src_state.string
+
+
+cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
+    state = <TestState*>mem.alloc(1, sizeof(TestState))
+    state.length = n
+    state.x = 1
+    if extra_args == NULL:
+        state.string = u'default'
+    else:
+        state.string = <Py_UNICODE*>extra_args
+    return state
+
+
+cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
+    state = <TestState*>state
+    mem.free(state)
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width",
+    [
+        (2, 3),
+        (3, 6),
+        (4, 20),
+    ]
+)
+def test_init(nr_class, beam_width):
+    b = Beam(nr_class, beam_width)
+    assert b.size == 1
+    assert b.width == beam_width
+    assert b.nr_class == nr_class
+
+@cytest
+def test_init_violn():
+    MaxViolation()
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (2, 3, 3),
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_initialize(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length, s.length
+        assert s.string == 'default'
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length,extra",
+    [
+        (2, 3, 4, None),
+        (3, 6, 15, u"test beam 1"),
+    ]
+)
+def test_initialize_extra(nr_class, beam_width, length, extra):
+    b = Beam(nr_class, beam_width)
+    if extra is None:
+        b.initialize(initialize, destroy, length, NULL)
+    else:
+        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_transition(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    b.set_cell(0, 2, 30, True, 0)
+    b.set_cell(0, 1, 42, False, 0)
+    b.advance(transition, NULL, NULL)
+    assert b.size == 1, b.size
+    assert b.score == 30, b.score
+    s = <TestState*>b.at(0)
+    assert s.x == 3
+    assert b._states[0].score == 30, b._states[0].score
+    b.set_cell(0, 1, 10, True, 0)
+    b.set_cell(0, 2, 20, True, 0)
+    b.advance(transition, NULL, NULL)
+    assert b._states[0].score == 50, b._states[0].score
+    assert b._states[1].score == 40
+    s = <TestState*>b.at(0)
+    assert s.x == 5
diff --git a/spacy/tests/parser/test_search.py b/spacy/tests/parser/test_search.py
new file mode 100644
index 00000000000..136c3a11b8a
--- /dev/null
+++ b/spacy/tests/parser/test_search.py
@@ -0,0 +1,3 @@
+from ..conftest import register_cython_tests
+
+register_cython_tests("spacy.tests.parser._search", __name__)

From 9edf840865970839060fbb6021460c509bf1ada9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Dec 2022 10:23:31 +0100
Subject: [PATCH 142/504] Fix fallout from a previous merge

---
 spacy/pipeline/textcat_multilabel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index ac024ba3639..9ed9770086c 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -163,6 +163,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.
         save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init

From cd7edad877902311e06b91c0dcd12068c4df4a4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 29 Dec 2022 08:03:24 +0100
Subject: [PATCH 143/504] Adjust to new `Schedule` class and pass scores to
 `Optimizer` (#12008)

* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
---
 pyproject.toml             |  2 +-
 requirements.txt           |  2 +-
 setup.cfg                  |  4 ++--
 spacy/training/batchers.py | 38 ++++++++++++++++----------------------
 spacy/training/loop.py     |  3 ++-
 spacy/util.py              | 13 +++++++++----
 6 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 77e1471426f..58e4382e829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev0,<9.1.0",
+    "thinc>=9.0.0.dev1,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 874f0826613..8b5cc51f875 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev0,<9.1.0
+thinc>=9.0.0.dev1,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 5b5c330deb5..613410c3347 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev0,<9.1.0
-    wasabi>=0.9.1,<1.1.0
+    thinc>=9.0.0.dev1,<9.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.4.0
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 050c3351b08..519e61315da 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,20 +1,9 @@
 import itertools
-from functools import partial
-from typing import (
-    Any,
-    Callable,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Sequence,
-    TypeVar,
-    Union,
-)
+from thinc.schedules import Schedule, constant as constant_schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int]
+Sizing = Union[Sequence[int], int, Schedule[int]]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -119,12 +108,13 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)  # type: Iterator[int]
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    for outer_batch in minibatch(seqs, size=buffer):
+        assert isinstance(size, Schedule)
+        size_ = size
+    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
         outer_batch = list(outer_batch)
-        target_size = next(size_)
+        target_size = size_(step)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -155,10 +145,12 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)  # type: Iterator[int]
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    target_size = next(size_)
+        assert isinstance(size, Schedule)
+        size_ = size
+    step = 0
+    target_size = size_(step)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -183,7 +175,8 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            target_size = next(size_)
+            step += 1
+            target_size = size_(step)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -201,7 +194,8 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                target_size = next(size_)
+                step += 1
+                target_size = size_(step)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 56df5395720..05c59fc9877 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -215,7 +215,7 @@ def train_while_improving(
         if before_update:
             before_update_args = {"step": step, "epoch": epoch}
             before_update(nlp, before_update_args)
-        dropout = next(dropouts)  # type: ignore
+        dropout = dropouts(optimizer.step)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
                 subbatch,
@@ -241,6 +241,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
+            optimizer.last_score = score
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
diff --git a/spacy/util.py b/spacy/util.py
index 463ac219bf5..551f78cc969 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,7 +1,12 @@
 import functools
 import importlib
 import importlib.util
-import inspect
+import re
+from pathlib import Path
+import thinc
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+import functools
 import itertools
 import logging
 import os
@@ -1637,12 +1642,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
         size_ = size
     items = iter(items)
-    while True:
-        batch_size = next(size_)
+    for step in itertools.count():
+        batch_size = size_(step)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From d25ee66deeb98699f174b61c4f7ac747d63ef7a6 Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso <tetsuok@users.noreply.github.com>
Date: Wed, 4 Jan 2023 01:43:09 +0900
Subject: [PATCH 144/504] Delete unused imports for StringStore (#12040)

---
 spacy/lexeme.pxd    | 18 ++++--------------
 spacy/tokenizer.pxd |  4 ++++
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index ff2e4f92edf..2d14edcd6b0 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,20 +1,10 @@
 from numpy cimport ndarray
 
-from .attrs cimport (
-    ID,
-    LANG,
-    LENGTH,
-    LOWER,
-    NORM,
-    ORTH,
-    PREFIX,
-    SHAPE,
-    SUFFIX,
-    attr_id_t,
-)
-from .strings cimport StringStore
+from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
+from .attrs cimport attr_id_t
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
+
 from .structs cimport LexemeC
-from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index f64e0e93413..c963dcbcfa4 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,6 +2,10 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
+from .typedefs cimport hash_t
+from .structs cimport LexemeC, SpanC, TokenC
+from .tokens.doc cimport Doc
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .strings cimport StringStore
 from .structs cimport LexemeC, SpanC, TokenC

From ce8489fe287c74b83a06a04f61809361012e3ccf Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Mon, 9 Jan 2023 20:15:02 +0100
Subject: [PATCH 145/504] Pass `step=0` to `Schedule` class to yield initial
 learning rate (#12078)

---
 spacy/training/loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 05c59fc9877..58d5b06786f 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -111,7 +111,7 @@ def save_checkpoint(is_best):
         stdout.write(
             msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
         )
-    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
     with nlp.select_pipes(disable=frozen_components):
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:

From e353017aa9a1ffe9c8ce261de57ae2ed9d93fcd6 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 18:57:50 +0100
Subject: [PATCH 146/504] update tests from master to follow v4 principles

---
 spacy/tests/pipeline/test_entity_ruler.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 9f5204006ec..ae57da5134c 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,9 +357,9 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy_pipe(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")
@@ -367,9 +367,9 @@ def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")
@@ -377,15 +377,14 @@ def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():
         return lambda x, y, z: False
 
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
-        name="entity_ruler",
+        "entity_ruler",
         config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
     )
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]

From 9018b2a67a1e53fdbdac2f3e7f21062b8003723d Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 19:04:06 +0100
Subject: [PATCH 147/504] update tests from master to follow v4 principles (2)

---
 spacy/tests/pipeline/test_entity_ruler.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index ae57da5134c..6bff3288dc3 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,7 +357,6 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_pipe(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -367,7 +366,6 @@ def test_entity_ruler_fuzzy_pipe(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -377,7 +375,6 @@ def test_entity_ruler_fuzzy(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():

From 30aa5b821e6f7515dd63a4e0c750dc8e2655f2b2 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 13 Jan 2023 11:14:58 +0100
Subject: [PATCH 148/504] fix anchors (#12095)

---
 website/docs/api/stringstore.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index d4d85e6d56a..269ac2d0c4b 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -97,7 +97,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | ------------------------------ |
 | **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.items {#iter tag="method" new="4"}
+## StringStore.items {id="items", tag="method", version="4"}
 
 Iterate over the stored string-hash pairs in insertion order.
 
@@ -113,7 +113,7 @@ Iterate over the stored string-hash pairs in insertion order.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
 
-## StringStore.keys {#iter tag="method" new="4"}
+## StringStore.keys {id="keys", tag="method", version="4"}
 
 Iterate over the stored strings in insertion order.
 
@@ -129,7 +129,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | -------------------------------- |
 | **RETURNS** | A list of strings. ~~List[str]~~ |
 
-## StringStore.values {#iter tag="method" new="4"}
+## StringStore.values {id="values", tag="method", version="4"}
 
 Iterate over the stored string hashes in insertion order.
 

From 500711c8ace06d20db9a333e2289ebdc5bbefa78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 16 Jan 2023 10:25:53 +0100
Subject: [PATCH 149/504] Add
 `TrainablePipe.{distill,get_teacher_student_loss}` (#12016)

* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                               |   3 +
 spacy/ml/callbacks.py                         |   1 +
 spacy/pipeline/edit_tree_lemmatizer.py        |  19 +++
 spacy/pipeline/pipe.pyx                       |   4 +
 spacy/pipeline/tagger.pyx                     |  26 ++-
 spacy/pipeline/trainable_pipe.pyx             |  72 +++++++-
 spacy/pipeline/transition_parser.pyx          | 160 +++++++++++++++++-
 spacy/tests/parser/test_ner.py                |  46 +++++
 spacy/tests/parser/test_parse.py              |  49 ++++++
 .../pipeline/test_edit_tree_lemmatizer.py     |  47 +++++
 spacy/tests/pipeline/test_morphologizer.py    |   6 +
 spacy/tests/pipeline/test_senter.py           |   6 +
 spacy/tests/pipeline/test_tagger.py           |  46 +++++
 spacy/tests/pipeline/test_textcat.py          |   6 +
 spacy/tests/training/test_training.py         |  27 +--
 spacy/training/__init__.py                    |   3 +
 spacy/training/example.pyx                    |   7 +
 website/docs/api/dependencyparser.mdx         |  54 ++++++
 website/docs/api/edittreelemmatizer.mdx       |  54 ++++++
 website/docs/api/entityrecognizer.mdx         |  54 ++++++
 website/docs/api/morphologizer.mdx            |  54 ++++++
 website/docs/api/pipe.mdx                     |  61 +++++++
 website/docs/api/sentencerecognizer.mdx       |  54 ++++++
 website/docs/api/tagger.mdx                   |  54 ++++++
 website/docs/api/top-level.mdx                |   3 +-
 website/docs/usage/processing-pipelines.mdx   |  14 +-
 26 files changed, 906 insertions(+), 24 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 11b8980fd9d..9bdb66006e5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -975,6 +975,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4000 = ("Expected a Doc as input, but got: '{type}'")
     E4001 = ("Expected input to be one of the following types: ({expected_types}), "
              "but got '{received_type}'")
+    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
+    E4003 = ("Training examples for distillation must have the exact same tokens in the "
+             "reference and predicted docs.")
 
 
 # fmt: on
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
index e2378a7baf3..0783a5568a9 100644
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -23,6 +23,7 @@
     "update",
     "rehearse",
     "get_loss",
+    "get_teacher_student_loss",
     "initialize",
     "begin_update",
     "finish_update",
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index f9a8ae10561..d5169178b8c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -157,6 +157,25 @@ def get_loss(
 
         return float(loss), d_scores
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index ea5fc5253d9..af7cd09f171 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -89,6 +89,10 @@ cdef class Pipe:
             return self.scorer(examples, **scorer_kwargs)
         return {}
 
+    @property
+    def is_distillable(self) -> bool:
+        return False
+
     @property
     def is_trainable(self) -> bool:
         return False
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f25ee00407b..a8a89332bd4 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Tuple
 import numpy
 import srsly
 from thinc.api import Model, set_dropout_rate, Config
@@ -243,7 +244,6 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -257,12 +257,32 @@ class Tagger(TrainablePipe):
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update(docs)
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
-        grads, loss = loss_func(tag_scores, tutor_tag_scores)
+        loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
         bp_tag_scores(grads)
-        self.finish_update(sgd)
+        if sgd is not None:
+            self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores.
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index bd360c9501b..3ec3e7551aa 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -7,7 +7,7 @@ import warnings
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples
+from ..training import validate_examples, validate_distillation_examples
 from ..errors import Errors, Warnings
 from .pipe import Pipe, deserialize_config
 from .. import util
@@ -59,7 +59,54 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
+
+    def distill(self,
+               teacher_pipe: Optional["TrainablePipe"],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is typically trained on the probability
+        distribution of the teacher, but details may differ per pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+        
+        DOCS: https://spacy.io/api/pipe#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        validate_distillation_examples(examples, "TrainablePipe.distill")
+        set_dropout_rate(self.model, drop)
+        for node in teacher_pipe.model.walk():
+            if node.name == "softmax":
+                node.attrs["softmax_normalize"] = True
+        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        bp_student_scores(d_scores)
+        if sgd is not None:
+            self.finish_update(sgd)
+        losses[self.name] += loss
+        return losses
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -172,6 +219,19 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
 
+    def get_teacher_student_loss(self, teacher_scores, student_scores):
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
+
     def create_optimizer(self) -> Optimizer:
         """Create an optimizer for the pipeline component.
 
@@ -208,6 +268,14 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
+    @property
+    def is_distillable(self) -> bool:
+        # Normally a pipe overrides `get_teacher_student_loss` to implement
+        # distillation. In more exceptional cases, a pipe can provide its
+        # own `distill` implementation. If neither of these methods is
+        # overridden, the pipe does not implement distillation.
+        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
+
     @property
     def is_trainable(self) -> bool:
         return True
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d310df92151..feab7e7404b 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
+from typing import Dict, Iterable, List, Optional, Tuple
+from cymem.cymem cimport Pool
 cimport numpy as np
 from cymem.cymem cimport Pool
 
@@ -14,7 +15,10 @@ from libcpp.vector cimport vector
 import random
 
 import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
 import numpy.random
 import numpy
 import numpy.random
@@ -34,6 +38,9 @@ from .trainable_pipe import TrainablePipe
 
 from ._parser_internals cimport _beam_utils
 
+from ..training import validate_examples, validate_get_examples
+from ..training import validate_distillation_examples
+from ..errors import Errors, Warnings
 from .. import util
 from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
@@ -212,6 +219,121 @@ cdef class Parser(TrainablePipe):
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError
 
+    def distill(self,
+               teacher_pipe: Optional[TrainablePipe],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None):
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is trained on the transition probabilities
+        of the teacher.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+        
+        DOCS: https://spacy.io/api/dependencyparser#distill
+        """
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+
+        validate_distillation_examples(examples, "TransitionParser.distill")
+
+        set_dropout_rate(self.model, drop)
+
+        student_docs = [eg.predicted for eg in examples]
+
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
+        max_moves = self.cfg["update_with_oracle_cut_size"]
+        if max_moves >= 1:
+            # Chop sequences into lengths of this many words, to make the
+            # batch uniform length. Since we do not have a gold standard
+            # sequence, we use the teacher's predictions as the gold
+            # standard.
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+        else:
+            states = self.moves.init_batch(student_docs)
+
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(student_docs)
+
+        if sgd is not None:
+            self.finish_update(sgd)
+
+        losses[self.name] += loss
+
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
+        return losses
+
+
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
         """Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.
@@ -645,6 +767,40 @@ cdef class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
+    def _init_batch(self, teacher_step_model, docs, max_length):
+        """Make a square batch of length equal to the shortest transition
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]. In contrast to
+        _init_gold_batch, this version uses a teacher model to generate the
+        cut sequences."""
+        cdef:
+            StateClass start_state
+            StateClass state
+            Transition action
+        all_states = self.moves.init_batch(docs)
+        states = []
+        to_cut = []
+        for state, doc in zip(all_states, docs):
+            if not state.is_final():
+                if len(doc) < max_length:
+                    states.append(state)
+                else:
+                    to_cut.append(state)
+        while to_cut:
+            states.extend(state.copy() for state in to_cut)
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
+        return states
+
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 1509c31bbba..54ee053981f 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
     assert ents[1].kb_id == 0
 
 
+def test_is_distillable():
+    nlp = English()
+    ner = nlp.add_pipe("ner")
+    assert ner.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_ner = teacher.add_pipe("ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            teacher_ner.add_label(ent[2])
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.00001
+
+    student = English()
+    student_ner = student.add_pipe("ner")
+    student_ner.initialize(
+        get_examples=lambda: train_examples, labels=teacher_ner.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(100):
+        losses = {}
+        student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.0001
+
+    # test the trained model
+    test_text = "I like London."
+    doc = student(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+
+
 def test_beam_ner_scores():
     # Test that we can get confidence values out of the beam_ner pipe
     beam_width = 16
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 3565c62af0f..a943c3538e0 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -396,6 +396,55 @@ def test_overfitting_IO(pipe_name):
     assert_equal(batch_deps_1, no_batch_deps)
 
 
+def test_is_distillable():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+    assert parser.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_parser = teacher.add_pipe("parser")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            teacher_parser.add_label(dep)
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(200):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["parser"] < 0.0001
+
+    student = English()
+    student_parser = student.add_pipe("parser")
+    student_parser.initialize(
+        get_examples=lambda: train_examples, labels=teacher_parser.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(200):
+        losses = {}
+        student_parser.distill(
+            teacher_parser, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["parser"] < 0.0001
+
+    test_text = "I like securities."
+    doc = student(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
+
+
 # fmt: off
 @pytest.mark.slow
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index ba2ed4e5ff3..0f204ead477 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -214,6 +214,53 @@ def test_overfitting_IO(top_k):
     assert doc4[3].lemma_ == "egg"
 
 
+def test_is_distillable():
+    nlp = English()
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    assert lemmatizer.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
+    teacher_lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    student = English()
+    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+    student_lemmatizer.min_tree_freq = 1
+    student_lemmatizer.initialize(
+        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_lemmatizer.distill(
+            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    test_text = "She likes blue eggs"
+    doc = student(test_text)
+    assert doc[0].lemma_ == "she"
+    assert doc[1].lemma_ == "like"
+    assert doc[2].lemma_ == "blue"
+    assert doc[3].lemma_ == "egg"
+
+
 def test_lemmatizer_requires_labels():
     nlp = English()
     nlp.add_pipe("trainable_lemmatizer")
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index c2b65977ac3..fffb7b4ed7f 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -77,6 +77,12 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)
 
 
+def test_is_distillable():
+    nlp = English()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.is_distillable
+
+
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 2e40d86ff48..94285178310 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -11,6 +11,12 @@
 from spacy.training import Example
 
 
+def test_is_distillable():
+    nlp = English()
+    senter = nlp.add_pipe("senter")
+    assert senter.is_distillable
+
+
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5deb323dd71..5da5c209975 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -237,6 +237,52 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_is_distillable():
+    nlp = English()
+    tagger = nlp.add_pipe("tagger")
+    assert tagger.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["tagger"] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_tagger.distill(
+            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 710dac0571d..214c1bfbed1 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -602,6 +602,12 @@ def test_initialize_examples(name, get_examples, train_data):
         nlp.initialize(get_examples=get_examples())
 
 
+def test_is_distillable():
+    nlp = English()
+    textcat = nlp.add_pipe("textcat")
+    assert not textcat.is_distillable
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index a492a8be358..68f7e8a0d57 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,16 +8,10 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import (
-    Alignment,
-    Corpus,
-    Example,
-    biluo_tags_to_offsets,
-    biluo_tags_to_spans,
-    docs_to_json,
-    iob_to_biluo,
-    offsets_to_biluo_tags,
-)
+from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
+from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
+from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
@@ -377,6 +371,19 @@ def test_example_from_dict_some_ner(en_vocab):
     assert ner_tags == ["U-LOC", None, None, None]
 
 
+def test_validate_distillation_examples(en_vocab):
+    words = ["a", "b", "c", "d"]
+    spaces = [True, True, False, True]
+    predicted = Doc(en_vocab, words=words, spaces=spaces)
+
+    example = Example.from_dict(predicted, {})
+    validate_distillation_examples([example], "test_validate_distillation_examples")
+
+    example = Example.from_dict(predicted, {"words": words + ["e"]})
+    with pytest.raises(ValueError, match=r"distillation"):
+        validate_distillation_examples([example], "test_validate_distillation_examples")
+
+
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_json_to_docs_no_ner(en_vocab):
     data = [
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 5c2ba99320d..358b2bd806d 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,3 +1,6 @@
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index abdcecf71d1..914e877f579 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,13 @@ def validate_examples(examples, method):
         raise TypeError(err)
 
 
+def validate_distillation_examples(examples, method):
+    validate_examples(examples, method)
+    for eg in examples:
+        if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
+            raise ValueError(Errors.E4003)
+
+
 def validate_get_examples(get_examples, method):
     """Check that a generator of a batch of examples received during processing is valid:
     the callable produces a non-empty list of Example objects.
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 771a00aeee1..5179ce48b84 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -131,6 +131,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## DependencyParser.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("parser")
+> student_pipe = student.add_pipe("parser")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -268,6 +301,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## DependencyParser.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_parser = teacher.get_pipe("parser")
+> student_parser = student.add_pipe("parser")
+> student_scores = student_parser.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_parser.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_parser.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## DependencyParser.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 17af19e8c38..2e099365758 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -115,6 +115,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## EditTreeLemmatizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("trainable_lemmatizer")
+> student_pipe = student.add_pipe("trainable_lemmatizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -269,6 +302,27 @@ Create an optimizer for the pipeline component.
 | ----------- | ---------------------------- |
 | **RETURNS** | The optimizer. ~~Optimizer~~ |
 
+## EditTreeLemmatizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_lemmatizer = teacher.get_pipe("trainable_lemmatizer")
+> student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+> student_scores = student_lemmatizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_lemmatizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_lemmatizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## EditTreeLemmatizer.use_params {id="use_params",tag="method, contextmanager"}
 
 Modify the pipe's model, to use the given parameter values. At the end of the
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 1f386bbb6ff..005d5d11deb 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -127,6 +127,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## EntityRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("ner")
+> student_pipe = student.add_pipe("ner")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -264,6 +297,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## EntityRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_ner = teacher.get_pipe("ner")
+> student_ner = student.add_pipe("ner")
+> student_scores = student_ner.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_ner.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_ner.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## EntityRecognizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 1fda807cb32..4f79458d319 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -121,6 +121,39 @@ delegate to the [`predict`](/api/morphologizer#predict) and
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Morphologizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("morphologizer")
+> student_pipe = student.add_pipe("morphologizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -259,6 +292,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_morphologizer = teacher.get_pipe("morphologizer")
+> student_morphologizer = student.add_pipe("morphologizer")
+> student_scores = student_morphologizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index b387ea58654..120c8f6908f 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -234,6 +234,39 @@ predictions and gold-standard annotations, and update the component's model.
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 
+## TrainablePipe.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("your_custom_pipe")
+> student_pipe = student.add_pipe("your_custom_pipe")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
@@ -281,6 +314,34 @@ This method needs to be overwritten with your own custom `get_loss` method.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## TrainablePipe.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+<Infobox variant="danger">
+
+This method needs to be overwritten with your own custom
+`get_teacher_student_loss` method.
+
+</Infobox>
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.get_pipe("your_custom_pipe")
+> student_pipe = student.add_pipe("your_custom_pipe")
+> student_scores = student_pipe.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_pipe.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_pipe.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## TrainablePipe.score {id="score",tag="method",version="3"}
 
 Score a batch of examples.
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index d5d096d7659..02fd57102e2 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -106,6 +106,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## SentenceRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("senter")
+> student_pipe = student.add_pipe("senter")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -254,6 +287,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## SentenceRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_senter = teacher.get_pipe("senter")
+> student_senter = student.add_pipe("senter")
+> student_scores = student_senter.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_senter.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_senter.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## SentenceRecognizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index ae14df212ee..664fd7940c1 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -105,6 +105,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tagger.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tagger")
+> student_pipe = student.add_pipe("tagger")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Tagger.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -265,6 +298,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## Tagger.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_tagger = teacher.get_pipe("tagger")
+> student_tagger = student.add_pipe("tagger")
+> student_scores = student_tagger.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_tagger.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_tagger.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## Tagger.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9cdc0c8ab02..77216924405 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -1048,7 +1048,8 @@ backprop passes.
 Recursively wrap both the models and methods of each pipe using
 [NVTX](https://nvidia.github.io/NVTX/) range markers. By default, the following
 methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
-`get_loss`, `initialize`, `begin_update`, `finish_update`, `update`.
+`get_loss`, `get_teacher_student_loss`, `initialize`, `begin_update`,
+`finish_update`, `update`.
 
 | Name                        | Description                                                                                                                                                     |
 | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index fb5de5da102..9dbdadd0ebc 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1355,12 +1355,14 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.
 
-| Name                                 | Description                                                                                                                                                                                                                                                                                                                                   |
-| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
-| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
-| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
-| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
+| Name                                                             | Description                                                                                                                                                                                                                                                                                                                                   |
+| ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`update`](/api/pipe#update)                                     | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
+| [`distill`](/api/pipe#distill)                                   | Learn from a teacher pipeline using a batch of [`Doc`](/api/doc) objects and update the component's model.                                                                                                                                                                                                                                    |
+| [`initialize`](/api/pipe#initialize)                             | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
+| [`get_loss`](/api/pipe#get_loss)                                 | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
+| [`get_teacher_student_loss`](/api/pipe#get_teacher_student_loss) | Return a tuple of the loss and the gradient for the student scores relative to the teacher scores.                                                                                                                                                                                                                                            |
+| [`score`](/api/pipe#score)                                       | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
 

From eec5c43d840586960261daca692e6bf69e9cba9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 11:27:45 +0100
Subject: [PATCH 150/504] Merge the parser refactor into `v4` (#10940)

* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89edb306608826dca50619ea8a60de2b14.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
  this required computing an intermittent flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut` size into
separate sequences that have at most `update_with_oracle_cut`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a git that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/errors.py                               |   3 +
 spacy/ml/_precomputable_affine.py             | 164 -----
 spacy/ml/models/parser.py                     | 174 ++---
 spacy/ml/parser_model.pxd                     |  55 --
 spacy/ml/parser_model.pyx                     | 539 ---------------
 spacy/ml/tb_framework.pxd                     |  28 +
 spacy/ml/tb_framework.py                      |  51 --
 spacy/ml/tb_framework.pyx                     | 621 ++++++++++++++++++
 .../_parser_internals/_beam_utils.pyx         |   3 +-
 .../_parser_internals/_parser_utils.pxd       |   2 +
 .../_parser_internals/_parser_utils.pyx       |  22 +
 spacy/pipeline/_parser_internals/_state.pxd   |  59 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +
 spacy/pipeline/_parser_internals/batch.pxd    |   2 +
 spacy/pipeline/_parser_internals/batch.pyx    |  52 ++
 spacy/pipeline/_parser_internals/ner.pyx      |   3 +
 .../pipeline/_parser_internals/stateclass.pyx |   7 +
 .../_parser_internals/transition_system.pxd   |   7 +
 .../_parser_internals/transition_system.pyx   |  71 ++
 .../{dep_parser.pyx => dep_parser.py}         |  17 +-
 spacy/pipeline/{ner.pyx => ner.py}            |  33 +-
 spacy/pipeline/transition_parser.pxd          |  31 -
 spacy/pipeline/transition_parser.pyx          | 509 ++++++--------
 spacy/tests/parser/test_ner.py                |  11 +-
 spacy/tests/parser/test_parse.py              |  80 ++-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  53 +-
 spacy/training/example.pyx                    |   1 -
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 35 files changed, 1293 insertions(+), 1426 deletions(-)
 delete mode 100644 spacy/ml/_precomputable_affine.py
 delete mode 100644 spacy/ml/parser_model.pxd
 delete mode 100644 spacy/ml/parser_model.pyx
 create mode 100644 spacy/ml/tb_framework.pxd
 delete mode 100644 spacy/ml/tb_framework.py
 create mode 100644 spacy/ml/tb_framework.pyx
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 create mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 create mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (97%)
 rename spacy/pipeline/{ner.pyx => ner.py} (93%)
 delete mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index 0eb529c2098..a4a87d68aec 100755
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,10 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -45,6 +43,7 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -52,6 +51,7 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2817147f3e9..36325711d4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,12 +90,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -111,12 +110,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -387,12 +385,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -405,12 +402,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/errors.py b/spacy/errors.py
index 9bdb66006e5..9074a3fead8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -215,6 +215,8 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -978,6 +980,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
+    E4004 = ("Backprop is not supported when is_train is not set.")
 
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
deleted file mode 100644
index 1c20c622b2c..00000000000
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..59483839206 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,18 +1,22 @@
-from typing import List, Optional, cast
-
-from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
+from typing import Optional, List, Tuple, Any
 from thinc.types import Floats2d
+from thinc.api import Model
+import warnings
 
+from ...errors import Errors, Warnings
 from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
-from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
+from ...tokens.doc import Doc
 
+TransitionSystem = Any  # TODO
+State = Any  # TODO
 
-@registry.architectures("spacy.TransitionBasedParser.v2")
-def build_tb_parser_model(
+
+@registry.architectures.register("spacy.TransitionBasedParser.v2")
+def transition_parser_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
@@ -20,6 +24,46 @@ def build_tb_parser_model(
     maxout_pieces: int,
     use_upper: bool,
     nO: Optional[int] = None,
+) -> Model:
+    if not use_upper:
+        warnings.warn(Warnings.W400)
+
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v3")
+def transition_parser_v3(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+) -> Model:
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+def build_tb_parser_model(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
 ) -> Model:
     """
     Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -52,14 +96,7 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
-        is replaced with a ReLu non-linearity if use_upper=True, and no
-        non-linearity if use_upper=False.
-    use_upper (bool): Whether to use an additional hidden layer after the state
-        vector in order to predict the action scores. It is recommended to set
-        this to False for large pretrained models such as transformers, and True
-        for smaller networks. The upper layer is computed on CPU, which becomes
-        a bottleneck on larger GPU-based models, where it's also less necessary.
+        Recommended values are 1, 2 or 3.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -70,106 +107,11 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        list2array(),
-        Linear(hidden_width, t2v_width),
+    return TransitionModel(
+        tok2vec=tok2vec,
+        state_tokens=nr_feature_tokens,
+        hidden_width=hidden_width,
+        maxout_pieces=maxout_pieces,
+        nO=nO,
+        unseen_classes=set(),
     )
-    tok2vec.set_dim("nO", hidden_width)
-    lower = _define_lower(
-        nO=hidden_width if use_upper else nO,
-        nF=nr_feature_tokens,
-        nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
-    )
-    upper = None
-    if use_upper:
-        with use_ops("cpu"):
-            # Initialize weights at zero, as it's a classification layer.
-            upper = _define_upper(nO=nO, nI=None)
-    return TransitionModel(tok2vec, lower, upper, resize_output)
-
-
-def _define_upper(nO, nI):
-    return Linear(nO=nO, nI=nI, init_W=zero_init)
-
-
-def _define_lower(nO, nF, nI, nP):
-    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
-
-
-def resize_output(model, new_nO):
-    if model.attrs["has_upper"]:
-        return _resize_upper(model, new_nO)
-    return _resize_lower(model, new_nO)
-
-
-def _resize_upper(model, new_nO):
-    upper = model.get_ref("upper")
-    if upper.has_dim("nO") is None:
-        upper.set_dim("nO", new_nO)
-        return model
-    elif new_nO == upper.get_dim("nO"):
-        return model
-
-    smaller = upper
-    nI = smaller.maybe_get_dim("nI")
-    with use_ops("cpu"):
-        larger = _define_upper(nO=new_nO, nI=nI)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc2f(new_nO, nI)
-        larger_b = larger.ops.alloc1f(new_nO)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        # Weights are stored in (nr_out, nr_in) format, so we're basically
-        # just adding rows here.
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:old_nO] = smaller_W
-            larger_b[:old_nO] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-    model._layers[-1] = larger
-    model.set_ref("upper", larger)
-    return model
-
-
-def _resize_lower(model, new_nO):
-    lower = model.get_ref("lower")
-    if lower.has_dim("nO") is None:
-        lower.set_dim("nO", new_nO)
-        return model
-
-    smaller = lower
-    nI = smaller.maybe_get_dim("nI")
-    nF = smaller.maybe_get_dim("nF")
-    nP = smaller.maybe_get_dim("nP")
-    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
-        larger_b = larger.ops.alloc2f(new_nO, nP)
-        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        smaller_pad = smaller.get_param("pad")
-        # Copy the old weights and padding into the new layer
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:, 0:old_nO, :, :] = smaller_W
-            larger_pad[:, :, 0:old_nO, :] = smaller_pad
-            larger_b[0:old_nO, :] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-        larger.set_param("pad", larger_pad)
-    model._layers[1] = larger
-    model.set_ref("lower", larger)
-    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
deleted file mode 100644
index 4d2d7b3feeb..00000000000
--- a/spacy/ml/parser_model.pxd
+++ /dev/null
@@ -1,55 +0,0 @@
-from libc.string cimport memcpy, memset
-from thinc.backends.cblas cimport CBlas
-
-from ..pipeline._parser_internals._state cimport StateC
-from ..typedefs cimport hash_t, weight_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const float* seen_classes
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* scores
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
-
-
-cdef WeightsC get_c_weights(model) except *
-
-cdef SizesC get_c_sizes(model, int batch_size) except *
-
-cdef ActivationsC alloc_activations(SizesC n) nogil
-
-cdef void free_activations(const ActivationsC* A) nogil
-
-cdef void predict_states(
-    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) nogil
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
-
-cdef void cpu_log_loss(
-    float* d_scores,
-    const float* costs,
-    const int* is_valid,
-    const float* scores,
-    int O
-) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
deleted file mode 100644
index 10a9f0bc485..00000000000
--- a/spacy/ml/parser_model.pyx
+++ /dev/null
@@ -1,539 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-# cython: profile=False
-cimport numpy as np
-from libc.math cimport exp
-from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.cblas cimport saxpy, sgemm
-from thinc.backends.linalg cimport Vec, VecVec
-
-import numpy
-import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps
-
-from .. import util
-from ..errors import Errors
-
-from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport weight_t
-
-
-cdef WeightsC get_c_weights(model) except *:
-    cdef WeightsC output
-    cdef precompute_hiddens state2vec = model.state2vec
-    output.feat_weights = state2vec.get_feat_weights()
-    output.feat_bias = <const float*>state2vec.bias.data
-    cdef np.ndarray vec2scores_W
-    cdef np.ndarray vec2scores_b
-    if model.vec2scores is None:
-        output.hidden_weights = NULL
-        output.hidden_bias = NULL
-    else:
-        vec2scores_W = model.vec2scores.get_param("W")
-        vec2scores_b = model.vec2scores.get_param("b")
-        output.hidden_weights = <const float*>vec2scores_W.data
-        output.hidden_bias = <const float*>vec2scores_b.data
-    cdef np.ndarray class_mask = model._class_mask
-    output.seen_classes = <const float*>class_mask.data
-    return output
-
-
-cdef SizesC get_c_sizes(model, int batch_size) except *:
-    cdef SizesC output
-    output.states = batch_size
-    if model.vec2scores is None:
-        output.classes = model.state2vec.get_dim("nO")
-    else:
-        output.classes = model.vec2scores.get_dim("nO")
-    output.hiddens = model.state2vec.get_dim("nO")
-    output.pieces = model.state2vec.get_dim("nP")
-    output.feats = model.state2vec.get_dim("nF")
-    output.embed_width = model.tokvecs.shape[1]
-    return output
-
-
-cdef ActivationsC alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    resize_activations(&A, n)
-    return A
-
-
-cdef void free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.scores)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(
-            A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
-        )
-        A.scores = <float*>realloc(
-            A.scores, n.states * n.classes * sizeof(A.scores[0])
-        )
-        A.unmaxed = <float*>realloc(
-            A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
-        )
-        A.hiddens = <float*>realloc(
-            A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
-        )
-        A.is_valid = <int*>realloc(
-            A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
-        )
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void predict_states(
-    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) nogil:
-    resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(
-        cblas,
-        A.unmaxed,
-        W.feat_weights,
-        A.token_ids,
-        n.states,
-        n.feats,
-        n.hiddens * n.pieces
-    )
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = _arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    memset(A.scores, 0, n.states * n.classes * sizeof(float))
-    if W.hidden_weights == NULL:
-        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(
-            False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes
-        )
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = A.scores[0]
-    for i in range(1, n.states * n.classes):
-        if A.scores[i] < min_:
-            min_ = A.scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if not W.seen_classes[j]:
-                A.scores[i*n.classes+j] = min_
-
-
-cdef void sum_state_features(
-    CBlas cblas,
-    float* output,
-    const float* cached,
-    const int* token_ids,
-    int B,
-    int F,
-    int O
-) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    padding = cached
-    cached += F * O
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
-
-
-cdef void cpu_log_loss(
-    float* d_scores,
-    const float* costs,
-    const int* is_valid,
-    const float* scores,
-    int O
-) nogil:
-    """Do multi-label log loss"""
-    cdef double max_, gmax, Z, gZ
-    best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = _arg_max(scores, O)
-
-    if best == -1 or guess == -1:
-        # These shouldn't happen, but if they do, we want to make sure we don't
-        # cause an OOB access.
-        return
-    Z = 1e-10
-    gZ = 1e-10
-    max_ = scores[guess]
-    gmax = scores[best]
-    for i in range(O):
-        Z += exp(scores[i] - max_)
-        if costs[i] <= costs[best]:
-            gZ += exp(scores[i] - gmax)
-    for i in range(O):
-        if costs[i] <= costs[best]:
-            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
-        else:
-            d_scores[i] = exp(scores[i]-max_) / Z
-
-
-cdef int arg_max_if_gold(
-    const weight_t* scores, const weight_t* costs, const int* is_valid, int n
-) nogil:
-    # Find minimum cost
-    cdef float cost = 1
-    for i in range(n):
-        if is_valid[i] and costs[i] < cost:
-            cost = costs[i]
-    # Now find best-scoring with that cost
-    cdef int best = -1
-    for i in range(n):
-        if costs[i] <= cost and is_valid[i]:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-class ParserStepModel(Model):
-    def __init__(
-        self,
-        docs,
-        layers,
-        *,
-        has_upper,
-        unseen_classes=None,
-        train=True,
-        dropout=0.1
-    ):
-        Model.__init__(self, name="parser_step_model", forward=step_forward)
-        self.attrs["has_upper"] = has_upper
-        self.attrs["dropout_rate"] = dropout
-        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
-        if layers[1].get_dim("nP") >= 2:
-            activation = "maxout"
-        elif has_upper:
-            activation = None
-        else:
-            activation = "relu"
-        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
-                                            activation=activation, train=train)
-        if has_upper:
-            self.vec2scores = layers[-1]
-        else:
-            self.vec2scores = None
-        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
-        self.backprops = []
-        self._class_mask = numpy.zeros((self.nO,), dtype='f')
-        self._class_mask.fill(1)
-        if unseen_classes is not None:
-            for class_ in unseen_classes:
-                self._class_mask[class_] = 0.
-
-    def clear_memory(self):
-        del self.tokvecs
-        del self.bp_tokvecs
-        del self.state2vec
-        del self.backprops
-        del self._class_mask
-
-    @property
-    def nO(self):
-        if self.attrs["has_upper"]:
-            return self.vec2scores.get_dim("nO")
-        else:
-            return self.state2vec.get_dim("nO")
-
-    def class_is_unseen(self, class_):
-        return self._class_mask[class_]
-
-    def mark_class_unseen(self, class_):
-        self._class_mask[class_] = 0
-
-    def mark_class_seen(self, class_):
-        self._class_mask[class_] = 1
-
-    def get_token_ids(self, states):
-        cdef StateClass state
-        states = [state for state in states if not state.is_final()]
-        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
-                                          dtype='i', order='C')
-        ids.fill(-1)
-        c_ids = <int*>ids.data
-        for state in states:
-            state.c.set_context_tokens(c_ids, ids.shape[1])
-            c_ids += ids.shape[1]
-        return ids
-
-    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-        if (
-            isinstance(self.state2vec.ops, CupyOps)
-            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
-        ):
-            # Move token_ids and d_vector to GPU, asynchronously
-            self.backprops.append((
-                util.get_async(self.cuda_stream, token_ids),
-                util.get_async(self.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
-    def finish_steps(self, golds):
-        # Add a padding vector to the d_tokvecs gradient, so that missing
-        # values don't affect the real gradient.
-        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
-        # Tells CUDA to block, so our async copies complete.
-        if self.cuda_stream is not None:
-            self.cuda_stream.synchronize()
-        for ids, d_vector, bp_vector in self.backprops:
-            d_state_features = bp_vector((d_vector, ids))
-            ids = ids.flatten()
-            d_state_features = d_state_features.reshape(
-                (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
-        # Padded -- see update()
-        self.bp_tokvecs(d_tokvecs[:-1])
-        return d_tokvecs
-
-
-NUMPY_OPS = NumpyOps()
-
-
-def step_forward(model: ParserStepModel, states, is_train):
-    token_ids = model.get_token_ids(states)
-    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
-    mask = None
-    if model.attrs["has_upper"]:
-        dropout_rate = model.attrs["dropout_rate"]
-        if is_train and dropout_rate > 0:
-            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
-            vector *= mask
-        scores, get_d_vector = model.vec2scores(vector, is_train)
-    else:
-        scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores  # no-cython-lint: E731
-    # If the class is unseen, make sure its score is minimum
-    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
-
-    def backprop_parser_step(d_scores):
-        # Zero vectors for unseen classes
-        d_scores *= model._class_mask
-        d_vector = get_d_vector(d_scores)
-        if mask is not None:
-            d_vector *= mask
-        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
-        return None
-    return scores, backprop_parser_step
-
-
-cdef class precompute_hiddens:
-    """Allow a model to be "primed" by pre-computing input features in bulk.
-
-    This is used for the parser, where we want to take a batch of documents,
-    and compute vectors for each (token, position) pair. These vectors can then
-    be reused, especially for beam-search.
-
-    Let's say we're using 12 features for each state, e.g. word at start of
-    buffer, three words on stack, their children, etc. In the normal arc-eager
-    system, a document of length N is processed in 2*N states. This means we'll
-    create 2*N*12 feature vectors --- but if we pre-compute, we only need
-    N*12 vector computations. The saving for beam-search is much better:
-    if we have a beam of k, we'll normally make 2*N*12*K computations --
-    so we can save the factor k. This also gives a nice CPU/GPU division:
-    we can do all our hard maths up front, packed into large multiplications,
-    and do the hard-to-program parsing on the CPU.
-    """
-    cdef readonly int nF, nO, nP
-    cdef bint _is_synchronized
-    cdef public object ops
-    cdef public object numpy_ops
-    cdef public object _cpu_ops
-    cdef np.ndarray _features
-    cdef np.ndarray _cached
-    cdef np.ndarray bias
-    cdef object _cuda_stream
-    cdef object _bp_hiddens
-    cdef object activation
-
-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
-                 activation="maxout", train=False):
-        gpu_cached, bp_features = lower_model(tokvecs, train)
-        cdef np.ndarray cached
-        if not isinstance(gpu_cached, numpy.ndarray):
-            # Note the passing of cuda_stream here: it lets
-            # cupy make the copy asynchronously.
-            # We then have to block before first use.
-            cached = gpu_cached.get(stream=cuda_stream)
-        else:
-            cached = gpu_cached
-        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
-            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
-        else:
-            self.bias = lower_model.get_param("b")
-        self.nF = cached.shape[1]
-        if lower_model.has_dim("nP"):
-            self.nP = lower_model.get_dim("nP")
-        else:
-            self.nP = 1
-        self.nO = cached.shape[2]
-        self.ops = lower_model.ops
-        self.numpy_ops = NumpyOps()
-        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
-        assert activation in (None, "relu", "maxout")
-        self.activation = activation
-        self._is_synchronized = False
-        self._cuda_stream = cuda_stream
-        self._cached = cached
-        self._bp_hiddens = bp_features
-
-    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized and self._cuda_stream is not None:
-            self._cuda_stream.synchronize()
-            self._is_synchronized = True
-        return <float*>self._cached.data
-
-    def has_dim(self, name):
-        if name == "nF":
-            return self.nF if self.nF is not None else True
-        elif name == "nP":
-            return self.nP if self.nP is not None else True
-        elif name == "nO":
-            return self.nO if self.nO is not None else True
-        else:
-            return False
-
-    def get_dim(self, name):
-        if name == "nF":
-            return self.nF
-        elif name == "nP":
-            return self.nP
-        elif name == "nO":
-            return self.nO
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def set_dim(self, name, value):
-        if name == "nF":
-            self.nF = value
-        elif name == "nP":
-            self.nP = value
-        elif name == "nO":
-            self.nO = value
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def __call__(self, X, bint is_train):
-        if is_train:
-            return self.begin_update(X)
-        else:
-            return self.predict(X), lambda X: X
-
-    def predict(self, X):
-        return self.begin_update(X)[0]
-
-    def begin_update(self, token_ids):
-        cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO, self.nP), dtype='f')
-        # This is tricky, but (assuming GPU available);
-        # - Input to forward on CPU
-        # - Output from forward on CPU
-        # - Input to backward on GPU!
-        # - Output from backward on GPU
-        bp_hiddens = self._bp_hiddens
-
-        cdef CBlas cblas = self._cpu_ops.cblas()
-
-        feat_weights = self.get_feat_weights()
-        cdef int[:, ::1] ids = token_ids
-        sum_state_features(
-            cblas, <float*>state_vector.data,
-            feat_weights, &ids[0, 0],
-            token_ids.shape[0], self.nF, self.nO*self.nP
-        )
-        state_vector += self.bias
-        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
-
-        def backward(d_state_vector_ids):
-            d_state_vector, token_ids = d_state_vector_ids
-            d_state_vector = bp_nonlinearity(d_state_vector)
-            d_tokens = bp_hiddens((d_state_vector, token_ids))
-            return d_tokens
-        return state_vector, backward
-
-    def _nonlinearity(self, state_vector):
-        if self.activation == "maxout":
-            return self._maxout_nonlinearity(state_vector)
-        else:
-            return self._relu_nonlinearity(state_vector)
-
-    def _maxout_nonlinearity(self, state_vector):
-        state_vector, mask = self.numpy_ops.maxout(state_vector)
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_maxout(d_best):
-            return self.ops.backprop_maxout(d_best, mask, self.nP)
-
-        return state_vector, backprop_maxout
-
-    def _relu_nonlinearity(self, state_vector):
-        state_vector = state_vector.reshape((state_vector.shape[0], -1))
-        mask = state_vector >= 0.
-        state_vector *= mask
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_relu(d_best):
-            d_best *= mask
-            return d_best.reshape((d_best.shape + (1,)))
-
-        return state_vector, backprop_relu
-
-cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
new file mode 100644
index 00000000000..965508519e8
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
deleted file mode 100644
index e351ad4e570..00000000000
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from thinc.api import Model, noop
-
-from ..util import registry
-from .parser_model import ParserStepModel
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
new file mode 100644
index 00000000000..79be13b00bd
--- /dev/null
+++ b/spacy/ml/tb_framework.pyx
@@ -0,0 +1,621 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+from typing import List, Tuple, Any, Optional, TypeVar, cast
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from libcpp.vector cimport vector
+import numpy
+cimport numpy as np
+from thinc.api import Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.api import NumpyOps
+from thinc.backends.cblas cimport CBlas, saxpy, sgemm
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
+from thinc.types import Ints1d, Ints2d
+
+from ..errors import Errors
+from ..pipeline._parser_internals import _beam_utils
+from ..pipeline._parser_internals.batch import GreedyBatch
+from ..pipeline._parser_internals._parser_utils cimport arg_max
+from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
+from ..pipeline._parser_internals.transition_system cimport TransitionSystem
+from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..tokens.doc import Doc
+from ..util import registry
+
+
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
+def TransitionModel(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    beam_width: int = 1,
+    beam_density: float = 0.0,
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
+
+    # FIXME: we use `output` as a container for the output layer's
+    # weights and biases. Thinc optimizers cannot handle resizing
+    # of parameters. So, when the parser model is resized, we
+    # construct a new `output` layer, which has a different key in
+    # the optimizer. Once the optimizer supports parameter resizing,
+    # we can replace the `output` layer by `output_W` and `output_b`
+    # parameters in this model.
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+
+    return Model(
+        name="parser_model",
+        forward=forward,
+        init=init,
+        layers=[tok2vec_projected, output],
+        refs={
+            "tok2vec": tok2vec_projected,
+            "output": output,
+        },
+        params={
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
+        attrs={
+            "beam_width": beam_width,
+            "beam_density": beam_density,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    output = model.get_ref("output")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        output.set_dim("nO", new_nO)
+        output.initialize()
+        return model
+    elif new_nO <= old_nO:
+        return model
+    elif output.has_param("W"):
+        nH = model.get_dim("nH")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
+        new_W[:old_nO] = old_W  # type: ignore
+        new_b[:old_nO] = old_b  # type: ignore
+        for i in range(old_nO, new_nO):
+            model.attrs["unseen_classes"].add(i)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
+    # TODO: Avoid this private intrusion
+    model._dims["nO"] = new_nO
+    return model
+
+
+def init(
+    model,
+    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
+    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
+):
+    if X is not None:
+        docs, moves = X
+        model.get_ref("tok2vec").initialize(X=docs)
+    else:
+        model.get_ref("tok2vec").initialize()
+    inferred_nO = _infer_nO(Y)
+    if inferred_nO is not None:
+        current_nO = model.maybe_get_dim("nO")
+        if current_nO is None or current_nO != inferred_nO:
+            model.attrs["resize_output"](model, inferred_nO)
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nH = model.get_dim("nH")
+    nI = model.get_dim("nI")
+    nF = model.get_dim("nF")
+    ops = model.ops
+
+    Wl = ops.alloc2f(nH * nP, nF * nI)
+    bl = ops.alloc1f(nH * nP)
+    padl = ops.alloc1f(nI)
+    # Wl = zero_init(ops, Wl.shape)
+    Wl = glorot_uniform_init(ops, Wl.shape)
+    padl = uniform_init(ops, padl.shape)  # type: ignore
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
+    # model = _lsuv_init(model)
+    return model
+
+
+class TransitionModelInputs:
+    """
+    Input to transition model.
+    """
+
+    # dataclass annotation is not yet supported in Cython 0.29.x,
+    # so, we'll do something close to it.
+
+    actions: Optional[List[Ints1d]]
+    docs: List[Doc]
+    max_moves: int
+    moves: TransitionSystem
+    states: Optional[List[State]]
+
+    __slots__ = [
+        "actions",
+        "docs",
+        "max_moves",
+        "moves",
+        "states",
+    ]
+
+    def __init__(
+        self,
+        docs: List[Doc],
+        moves: TransitionSystem,
+        actions: Optional[List[Ints1d]]=None,
+        max_moves: int=0,
+        states: Optional[List[State]]=None):
+        """
+        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
+        docs (List[Doc]): Docs to predict transition sequences for.
+        max_moves: (int): the maximum number of moves to apply, values less
+            than 1 will apply moves to states until they are final states.
+        moves (TransitionSystem): the transition system to use when predicting
+            the transition sequences.
+        states (Optional[List[States]]): the initial states to predict the
+            transition sequences for. When absent, the initial states are
+            initialized from the provided Docs.
+        """
+        self.actions = actions
+        self.docs = docs
+        self.moves = moves
+        self.max_moves = max_moves
+        self.states = states
+
+
+def forward(model, inputs: TransitionModelInputs, is_train: bool):
+    docs = inputs.docs
+    moves = inputs.moves
+    actions = inputs.actions
+
+    beam_width = model.attrs["beam_width"]
+    hidden_pad = model.get_param("hidden_pad")
+    tok2vec = model.get_ref("tok2vec")
+
+    states = moves.init_batch(docs) if inputs.states is None else inputs.states
+    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
+    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
+    seen_mask = _get_seen_mask(model)
+
+    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
+        # Note: max_moves is only used during training, so we don't need to
+        #       pass it to the greedy inference path.
+        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
+    else:
+        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
+            feats, backprop_feats, seen_mask, is_train, actions=actions,
+            max_moves=inputs.max_moves)
+
+
+def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
+                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+    cdef vector[StateC*] c_states
+    cdef StateClass state
+    for state in states:
+        if not state.is_final():
+            c_states.push_back(state.c)
+    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
+    # Precomputed features have rows for each token, plus one for padding.
+    cdef int n_tokens = feats.shape[0] - 1
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
+    cdef CBlas cblas = model.ops.cblas()
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+
+    def backprop(dY):
+        raise ValueError(Errors.E4004)
+
+    return (states, scores), backprop
+
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+    cdef int i, j
+    cdef vector[StateC *] unfinished
+    cdef ActivationsC activations = _alloc_activations(sizes)
+    cdef np.ndarray step_scores
+    cdef np.ndarray step_actions
+
+    scores = []
+    while sizes.states >= 1:
+        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
+        step_actions = actions[0] if actions is not None else None
+        with nogil:
+            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
+            if actions is None:
+                # Validate actions, argmax, take action.
+                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
+                    sizes.states)
+            else:
+                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+        sizes.states = unfinished.size()
+        scores.append(step_scores)
+        unfinished.clear()
+        actions = actions[1:] if actions is not None else None
+    _free_activations(&activations)
+
+    return scores
+
+
+def _forward_fallback(
+    model: Model,
+    moves: TransitionSystem,
+    states: List[StateClass],
+    tokvecs, backprop_tok2vec,
+    feats,
+    backprop_feats,
+    seen_mask,
+    is_train: bool,
+    actions: Optional[List[Ints1d]]=None,
+    max_moves: int=0):
+    nF = model.get_dim("nF")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+
+    beam_width = model.attrs["beam_width"]
+    beam_density = model.attrs["beam_density"]
+
+    ops = model.ops
+
+    all_ids = []
+    all_which = []
+    all_statevecs = []
+    all_scores = []
+    if beam_width == 1:
+        batch = GreedyBatch(moves, states, None)
+    else:
+        batch = _beam_utils.BeamBatch(
+            moves, states, None, width=beam_width, density=beam_density
+        )
+    arange = ops.xp.arange(nF)
+    n_moves = 0
+    while not batch.is_done:
+        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
+        for i, state in enumerate(batch.get_unfinished_states()):
+            state.set_context_tokens(ids, i, nF)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
+        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
+        preacts2f += hidden_b
+        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
+        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
+        statevecs, which = ops.maxout(preacts)
+        # We don't use output's backprop, since we want to backprop for
+        # all states at once, rather than a single state.
+        scores = output.predict(statevecs)
+        scores[:, seen_mask] = ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
+        cpu_scores = ops.to_numpy(scores)
+        if actions is None:
+            batch.advance(cpu_scores)
+        else:
+            batch.advance_with_actions(actions[0])
+            actions = actions[1:]
+        all_scores.append(scores)
+        if is_train:
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
+        if n_moves >= max_moves >= 1:
+            break
+        n_moves += 1
+
+    def backprop_parser(d_states_d_scores):
+        ids = ops.xp.vstack(all_ids)
+        which = ops.xp.vstack(all_which)
+        statevecs = ops.xp.vstack(all_statevecs)
+        _, d_scores = d_states_d_scores
+        if model.attrs.get("unseen_classes"):
+            # If we have a negative gradient (i.e. the probability should
+            # increase) on any classes we filtered out as unseen, mark
+            # them as seen.
+            for clas in set(model.attrs["unseen_classes"]):
+                if (d_scores[:, clas] < 0).any():
+                    model.attrs["unseen_classes"].remove(clas)
+        d_scores *= seen_mask == False
+        # Calculate the gradients for the parameters of the output layer.
+        # The weight gemm is (nS, nO) @ (nS, nH).T
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
+        # This gemm is (nS, nO) @ (nO, nH)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
+        # Backprop through the maxout activation
+        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
+        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        # We don't need to backprop the summation, because we pass back the IDs instead
+        d_state_features = backprop_feats((d_preacts2f, ids))
+        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
+        ops.scatter_add(d_tokvecs, ids, d_state_features)
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
+        return (backprop_tok2vec(d_tokvecs[:-1]), None)
+
+    return (list(batch), all_scores), backprop_parser
+
+
+def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
+    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = True
+    return mask
+
+
+def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
+    W: Floats2d = model.get_param("hidden_W")
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
+    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
+    W3f = W3f.transpose((1, 0, 2))
+    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
+    assert X.shape == (X.shape[0], nI), X.shape
+    Yf_ = model.ops.gemm(X, W2f, trans2=True)
+    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
+
+    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nH, nP), and get back:
+        # (nB, nH, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nH, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        dXf = model.ops.gemm(dY, W)
+        Xf = X[ids].reshape((ids.shape[0], -1))
+        dW = model.ops.gemm(dY, Xf, trans1=True)
+        model.inc_grad("hidden_W", dW)
+        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
+
+    return Yf, backward
+
+
+def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
+    if Y is None:
+        return None
+    _, scores = Y
+    if len(scores) == 0:
+        return None
+    assert scores[0].shape[0] >= 1
+    assert len(scores[0].shape) == 2
+    return scores[0].shape[1]
+
+
+def _lsuv_init(model: Model):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    W = model.maybe_get_param("hidden_W")
+    if W is not None and W.any():
+        return
+
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nH, nP, nI)
+    b = model.ops.alloc2f(nH, nP)
+    pad = model.ops.alloc4f(1, nF, nH, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc_f((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc_f((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("hidden_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("hidden_b", b)
+        else:
+            break
+    return model
+
+
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
+
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = <const float*>hidden_b.data
+    weights.hidden_weights = <const float *> output_W.data
+    weights.hidden_bias = <const float *> output_b.data
+    weights.seen_mask = <const int8_t*> seen_mask.data
+
+    return weights
+
+
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
+
+
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    _resize_activations(&A, n)
+    return A
+
+
+cdef void _free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    if W.hidden_weights == NULL:
+        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+                      1.0, <const float *>A.hiddens, n.hiddens,
+                      <const float *>W.hidden_weights, n.hiddens,
+                      0.0, scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = scores[0]
+    for i in range(1, n.states * n.classes):
+        if scores[i] < min_:
+            min_ = scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if W.seen_mask[j]:
+                scores[i*n.classes+j] = min_
+
+
+cdef void _sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    cdef int B = n.states
+    cdef int O = n.hiddens * n.pieces
+    cdef int F = n.feats
+    cdef int T = n.tokens
+    padding = cached + (T * F * O)
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index d004d313c3e..c86de231d09 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -6,6 +6,7 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
@@ -25,7 +26,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(object):
+cdef class BeamBatch(Batch):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
new file mode 100644
index 00000000000..7fee05bad60
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd
@@ -0,0 +1,2 @@
+cdef int arg_max(const float* scores, const int n_classes) nogil
+cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
new file mode 100644
index 00000000000..582756bf5be
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx
@@ -0,0 +1,22 @@
+# cython: infer_types=True
+
+cdef inline int arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index c063cf97cd4..1b6b25e5f16 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -8,6 +8,7 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -27,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -35,31 +36,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -132,19 +136,20 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
             return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
     int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
         else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
             if b_i >= this.length:
                 return -1
             else:
@@ -243,7 +248,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
             return 1
         else:
             return 0
@@ -327,7 +332,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -347,6 +352,9 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -355,12 +363,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = arcs.back()
+        arc = &arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -400,10 +408,11 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 10f2649baa0..673e36bf5ac 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -778,6 +778,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -862,6 +864,7 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
new file mode 100644
index 00000000000..60734e549aa
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pxd
@@ -0,0 +1,2 @@
+cdef class Batch:
+    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
new file mode 100644
index 00000000000..91073b52e68
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pyx
@@ -0,0 +1,52 @@
+from typing import Any
+
+TransitionSystem = Any  # TODO
+
+cdef class Batch:
+    def advance(self, scores):
+        raise NotImplementedError
+
+    def get_states(self):
+        raise NotImplementedError
+
+    @property
+    def is_done(self):
+        raise NotImplementedError
+
+    def get_unfinished_states(self):
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+
+class GreedyBatch(Batch):
+    def __init__(self, moves: TransitionSystem, states, golds):
+        self._moves = moves
+        self._states = states
+        self._next_states = [s for s in states if not s.is_final()]
+
+    def advance(self, scores):
+        self._next_states = self._moves.transition_states(self._next_states, scores)
+
+    def advance_with_actions(self, actions):
+        self._next_states = self._moves.apply_actions(self._next_states, actions)
+
+    def get_states(self):
+        return self._states
+
+    @property
+    def is_done(self):
+        return all(s.is_final() for s in self._states)
+
+    def get_unfinished_states(self):
+        return [st for st in self._states if not st.is_final()]
+
+    def __getitem__(self, i):
+        return self._states[i]
+
+    def __len__(self):
+        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 6851f9f2096..cf19c834ed9 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -311,6 +311,8 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -648,6 +650,7 @@ cdef class Unit:
         return cost
 
 
+
 cdef class Out:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index 24b9f1adc33..e49ff63c48b 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -20,6 +20,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -176,3 +180,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 04cd10d8864..66cc7747b69 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,3 +57,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index e035053b314..d1340d68c62 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,6 +3,8 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -11,6 +13,7 @@ import srsly
 from ...structs cimport TokenC
 from ...typedefs cimport attr_t, weight_t
 from .stateclass cimport StateClass
+from ._parser_utils cimport arg_max_if_valid
 
 from ... import util
 from ...errors import Errors
@@ -74,7 +77,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -86,6 +100,8 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -111,6 +127,7 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -138,6 +155,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +289,35 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil:
+        cdef int i
+        cdef Transition action
+        cdef StateC* state
+        for i in range(batch_size):
+            state = states[i]
+            action = moves.c[actions[i]]
+            action.do(state, action.label)
+            state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+    int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)
+
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py
similarity index 97%
rename from spacy/pipeline/dep_parser.pyx
rename to spacy/pipeline/dep_parser.py
index 18a220bd631..370a698c25a 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.py
@@ -5,6 +5,8 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from ._parser_internals.arc_eager cimport ArcEager
 from .transition_parser cimport Parser
@@ -19,12 +21,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -230,6 +231,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -237,8 +239,11 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -251,11 +256,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -275,8 +281,7 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py
similarity index 93%
rename from spacy/pipeline/ner.pyx
rename to spacy/pipeline/ner.py
index bb009dc7a6a..4c2a3ac093c 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.py
@@ -5,6 +5,13 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
+from ..util import registry
+from ..training import remove_bilu_prefix
 
 from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
@@ -16,12 +23,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -46,8 +52,12 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -114,7 +124,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -188,11 +203,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -210,15 +226,14 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,   # not relevant for NER
+            min_action_freq=1,  # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
deleted file mode 100644
index 7ef20563b12..00000000000
--- a/spacy/pipeline/transition_parser.pxd
+++ /dev/null
@@ -1,31 +0,0 @@
-from cymem.cymem cimport Pool
-from thinc.backends.cblas cimport CBlas
-
-from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
-from ..vocab cimport Vocab
-from ._parser_internals._state cimport StateC
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from .trainable_pipe cimport TrainablePipe
-
-
-cdef class Parser(TrainablePipe):
-    cdef public object _rehearsal_model
-    cdef readonly TransitionSystem moves
-    cdef public object _multitasks
-    cdef object _cpu_ops
-
-    cdef void _parseC(
-        self,
-        CBlas cblas,
-        StateC** states,
-        WeightsC weights,
-        SizesC sizes
-    ) nogil
-
-    cdef void c_transition_batch(
-        self,
-        StateC** states,
-        const float* scores,
-        int nr_class,
-        int batch_size
-    ) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index feab7e7404b..d71a4ab0355 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,30 +13,29 @@ from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
+import contextlib
 
 import srsly
 from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
+from thinc.api import chain, softmax_activation, use_ops, get_array_module
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 import numpy.random
 import numpy
 import numpy.random
 import srsly
 from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ._parser_internals.stateclass cimport StateClass
+from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
-from ._parser_internals.stateclass cimport StateClass
-
-from .trainable_pipe import TrainablePipe
-
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
+from ..vocab cimport Vocab
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ..typedefs cimport weight_t
 
 from ..training import validate_examples, validate_get_examples
 from ..training import validate_distillation_examples
@@ -49,7 +48,7 @@ from ._parser_internals import _beam_utils
 NUMPY_OPS = NumpyOps()
 
 
-cdef class Parser(TrainablePipe):
+class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -149,8 +148,9 @@ cdef class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
+        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -256,15 +256,6 @@ cdef class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
-        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
-        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
-
-        # Add softmax activation, so that we can compute student losses
-        # with cross-entropy loss.
-        with use_ops("numpy"):
-            teacher_model = chain(teacher_step_model, softmax_activation())
-            student_model = chain(student_step_model, softmax_activation())
-        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -272,51 +263,39 @@ cdef class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        loss = 0.0
-        n_moves = 0
-        while states:
-            # We do distillation as follows: (1) for every state, we compute the
-            # transition softmax distributions: (2) we backpropagate the error of
-            # the student (compared to the teacher) into the student model; (3)
-            # for all states, we move to the next state using the student's
-            # predictions.
-            teacher_scores = teacher_model.predict(states)
-            student_scores, backprop = student_model.begin_update(states)
-            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-            backprop(d_scores)
-            loss += state_loss
-            self.transition_states(states, student_scores)
-            states = [state for state in states if not state.is_final()]
-
-            # Stop when we reach the maximum number of moves, otherwise we start
-            # to process the remainder of cut sequences again.
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+        # We distill as follows: 1. we first let the student predict transition
+        # sequences (and the corresponding transition probabilities); (2) we
+        # let the teacher follow the student's predicted transition sequences
+        # to obtain the teacher's transition probabilities; (3) we compute the
+        # gradients of the student's transition distributions relative to the
+        # teacher's distributions.
+
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
+            moves=self.moves, actions=actions)
+        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
-        backprop_tok2vec(student_docs)
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        backprop_scores((student_states, d_scores))
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
-        del backprop
-        del backprop_tok2vec
-        teacher_step_model.clear_memory()
-        student_step_model.clear_memory()
-        del teacher_model
-        del student_model
-
         return losses
 
 
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+        normalize: bool=False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -328,10 +307,28 @@ cdef class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
+
+        # We can't easily hook up a softmax layer in the parsing model, since
+        # the get_loss does additional masking. So, we could apply softmax
+        # manually here and use Thinc's cross-entropy loss. But it's a bit
+        # suboptimal, since we can have a lot of states that would result in
+        # many kernel launches. Futhermore the parsing model's backprop expects
+        # a XP array, so we'd have to concat the softmaxes anyway. So, like
+        # the get_loss implementation, we'll compute the loss and gradients
+        # ourselves.
+
+        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
+            axis=-1, inplace=True)
+        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
+            axis=-1, inplace=True)
+
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = student_scores - teacher_scores
+        if normalize:
+            d_scores /= d_scores.shape[0]
+        loss = (d_scores**2).sum() / d_scores.size
+
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -354,9 +351,6 @@ cdef class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
-        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
-            deals with a failing batch of documents. The default function just reraises
-            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -377,78 +371,29 @@ cdef class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
+        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        if self.cfg["beam_width"] == 1:
-            return self.greedy_parse(docs, drop=0.0)
-        else:
-            return self.beam_parse(
-                docs,
-                drop=0.0,
-                beam_width=self.cfg["beam_width"],
-                beam_density=self.cfg["beam_density"]
-            )
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states_or_beams, _ = self.model.predict(inputs)
+        return states_or_beams
 
     def greedy_parse(self, docs, drop=0.):
-        cdef vector[StateC*] states
-        cdef StateClass state
-        cdef CBlas cblas = self._cpu_ops.cblas()
+        self._resize()
         self._ensure_labels_are_added(docs)
-        set_dropout_rate(self.model, drop)
-        batch = self.moves.init_batch(docs)
-        model = self.model.predict(docs)
-        weights = get_c_weights(model)
-        for state in batch:
-            if not state.is_final():
-                states.push_back(state.c)
-        sizes = get_c_sizes(model, states.size())
-        with nogil:
-            self._parseC(cblas, &states[0], weights, sizes)
-        model.clear_memory()
-        del model
-        return batch
+        with _change_attrs(self.model, beam_width=1):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states, _ = self.model.predict(inputs)
+        return states
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
         self._ensure_labels_are_added(docs)
-        batch = _beam_utils.BeamBatch(
-            self.moves,
-            self.moves.init_batch(docs),
-            None,
-            beam_width,
-            density=beam_density
-        )
-        model = self.model.predict(docs)
-        while not batch.is_done:
-            states = batch.get_unfinished_states()
-            if not states:
-                break
-            scores = model.predict(states)
-            batch.advance(scores)
-        model.clear_memory()
-        del model
-        return list(batch)
-
-    cdef void _parseC(
-        self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
-    ) nogil:
-        cdef int i
-        cdef vector[StateC*] unfinished
-        cdef ActivationsC activations = alloc_activations(sizes)
-        while sizes.states >= 1:
-            predict_states(cblas, &activations, states, &weights, sizes)
-            # Validate actions, argmax, take action.
-            self.c_transition_batch(
-                states, activations.scores, sizes.classes, sizes.states
-            )
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-            sizes.states = unfinished.size()
-            unfinished.clear()
-        free_activations(&activations)
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            beams, _ = self.model.predict(inputs)
+        return beams
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -459,40 +404,6 @@ cdef class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
-    def transition_states(self, states, float[:, ::1] scores):
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    cdef void c_transition_batch(
-        self,
-        StateC** states,
-        const float* scores,
-        int nr_class,
-        int batch_size
-    ) nogil:
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        with gil:
-            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
-        cdef int i, guess
-        cdef Transition action
-        for i in range(batch_size):
-            self.moves.set_valid(is_valid, states[i])
-            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-            if guess == -1:
-                # This shouldn't happen, but it's hard to raise an error here,
-                # and we don't want to infinite loop. So, force to end state.
-                states[i].force_final()
-            else:
-                action = self.moves.c[guess]
-                action.do(states[i], action.label)
-        free(is_valid)
-
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -503,66 +414,99 @@ cdef class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
+        # We need to take care to act on the whole batch, because we might be
+        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        # The probability we use beam update, instead of falling back to
-        # a greedy update
-        beam_update_prob = self.cfg["beam_update_prob"]
-        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
-            return self.update_beam(
-                examples,
-                beam_width=self.cfg["beam_width"],
-                sgd=sgd,
-                losses=losses,
-                beam_density=self.cfg["beam_density"]
-            )
+        docs = [eg.x for eg in examples if len(eg.x)]
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            init_states, gold_states, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-
-        states_golds = list(zip(states, golds))
-        n_moves = 0
-        while states_golds:
-            states, golds = zip(*states_golds)
-            scores, backprop = model.begin_update(states)
-            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            # Note that the gradient isn't normalized by the batch size
-            # here, because our "samples" are really the states...But we
-            # can't normalize by the number of states either, as then we'd
-            # be getting smaller gradients for states in long sequences.
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        backprop_tok2vec(golds)
+        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
+            max_moves=max_moves, states=[state.copy() for state in init_states])
+        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
+        if sum(s.shape[0] for s in scores) == 0:
+            return losses
+        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
+            examples, max_moves)
+        backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
+        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        del model
+        del backprop_scores
         return losses
 
+    def get_loss(self, states_scores, examples, max_moves):
+        gold_states, init_states, pred_states, scores = states_scores
+        scores = self.model.ops.xp.vstack(scores)
+        costs = self._get_costs_from_histories(
+            examples,
+            gold_states,
+            init_states,
+            [list(state.history) for state in pred_states],
+            max_moves
+        )
+        xp = get_array_module(scores)
+        best_costs = costs.min(axis=1, keepdims=True)
+        gscores = scores.copy()
+        min_score = scores.min() - 1000
+        assert costs.shape == scores.shape, (costs.shape, scores.shape)
+        gscores[costs > best_costs] = min_score
+        max_ = scores.max(axis=1, keepdims=True)
+        gmax = gscores.max(axis=1, keepdims=True)
+        exp_scores = xp.exp(scores - max_)
+        exp_gscores = xp.exp(gscores - gmax)
+        Z = exp_scores.sum(axis=1, keepdims=True)
+        gZ = exp_gscores.sum(axis=1, keepdims=True)
+        d_scores = exp_scores / Z
+        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
+        return d_scores
+
+    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
+        cdef TransitionSystem moves = self.moves
+        cdef StateClass state
+        cdef int clas
+        cdef int nF = self.model.get_dim("nF")
+        cdef int nO = moves.n_moves
+        cdef int nS = sum([len(history) for history in histories])
+        cdef Pool mem = Pool()
+        cdef np.ndarray costs_i
+        is_valid = <int*>mem.alloc(nO, sizeof(int))
+        batch = list(zip(init_states, histories, gold_states))
+        n_moves = 0
+        output = []
+        while batch:
+            costs = numpy.zeros((len(batch), nO), dtype="f")
+            for i, (state, history, gold) in enumerate(batch):
+                costs_i = costs[i]
+                clas = history.pop(0)
+                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
+                action = moves.c[clas]
+                action.do(state.c, action.label)
+                state.c.history.push_back(clas)
+            output.append(costs)
+            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
+            if n_moves >= max_moves >= 1:
+                break
+            n_moves += 1
+
+        return self.model.ops.xp.vstack(output)
+
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -572,10 +516,9 @@ cdef class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.)
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
-        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -583,95 +526,33 @@ cdef class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, backprop_tok2vec = self.model.begin_update(docs)
-        n_scores = 0.
-        loss = 0.
-        while states:
-            targets, _ = tutor.begin_update(states)
-            guesses, backprop = model.begin_update(states)
-            d_scores = (guesses - targets) / targets.shape[0]
-            # If all weights for an output are 0 in the original model, don't
-            # supervise that output. This allows us to add classes.
-            loss += (d_scores**2).sum()
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, guesses)
-            states = [state for state in states if not state.is_final()]
-            n_scores += d_scores.size
-        # Do the backprop
-        backprop_tok2vec(docs)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss / n_scores
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        tutor.clear_memory()
-        del model
-        del tutor
-        return losses
+        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
+        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
+
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
+
+        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
+        student_scores = self.model.ops.xp.vstack(student_scores)
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
+        # If all weights for an output are 0 in the original model, don't
+        # supervise that output. This allows us to add classes.
+        loss = (d_scores**2).sum() / d_scores.size
+        backprop_scores((student_states, d_scores))
 
-    def update_beam(
-        self,
-        examples,
-        *,
-        beam_width,
-        drop=0.,
-        sgd=None,
-        losses=None,
-        beam_density=0.0
-    ):
-        states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        # Prepare the stepwise model, and get the callback for finishing the batch
-        model, backprop_tok2vec = self.model.begin_update(
-            [eg.predicted for eg in examples])
-        loss = _beam_utils.update_beam(
-            self.moves,
-            states,
-            golds,
-            model,
-            beam_width,
-            beam_density=beam_density,
-        )
-        losses[self.name] += loss
-        backprop_tok2vec(golds)
         if sgd is not None:
             self.finish_update(sgd)
+        losses[self.name] += loss
 
-    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
-        cdef StateClass state
-        cdef Pool mem = Pool()
-        cdef int i
-
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        return losses
 
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros(
-            (len(states), self.moves.n_moves), dtype='f', order='C'
-        )
-        c_d_scores = <float*>d_scores.data
-        unseen_classes = self.model.attrs["unseen_classes"]
-        for i, (state, gold) in enumerate(zip(states, golds)):
-            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
-            memset(costs, 0, self.moves.n_moves * sizeof(float))
-            self.moves.set_costs(is_valid, costs, state.c, gold)
-            for j in range(self.moves.n_moves):
-                if costs[j] <= 0.0 and j in unseen_classes:
-                    unseen_classes.remove(j)
-            cpu_log_loss(
-                c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
-            )
-            c_d_scores += d_scores.shape[1]
-        # Note that we don't normalize this. See comment in update() for why.
-        if losses is not None:
-            losses.setdefault(self.name, 0.)
-            losses[self.name] += (d_scores**2).sum()
-        return d_scores
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        raise NotImplementedError
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -710,7 +591,7 @@ cdef class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(doc_sample)
+        self.model.initialize((doc_sample, self.moves))
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -803,26 +684,27 @@ cdef class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long
-        doc will get multiple states. Let's say we have a doc of length 2*N,
-        where N is the shortest doc. We'll make two states, one representing
-        long_doc[:N], and another representing long_doc[N:]."""
+        sequence or a cap. A long doc will get multiple states. Let's say we
+        have a doc of length 2*N, where N is the shortest doc. We'll make
+        two states, one representing long_doc[:N], and another representing
+        long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-        all_states = self.moves.init_batch([eg.predicted for eg in examples])
+            TransitionSystem moves = self.moves
+        all_states = moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if self.moves.has_gold(eg) and not state.is_final():
-                gold = self.moves.init_gold(state, eg)
+            if moves.has_gold(eg) and not state.is_final():
+                gold = moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                    oracle_actions = moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -832,13 +714,52 @@ cdef class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = self.moves.c[clas]
+                    action = moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
+
+
+@contextlib.contextmanager
+def _change_attrs(model, **kwargs):
+    """Temporarily modify a thinc model's attributes."""
+    unset = object()
+    old_attrs = {}
+    for key, value in kwargs.items():
+        old_attrs[key] = model.attrs.get(key, unset)
+        model.attrs[key] = value
+    yield model
+    for key, value in old_attrs.items():
+        if value is unset:
+            model.attrs.pop(key)
+        else:
+            model.attrs[key] = value
+
+
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
+    cdef int step
+    cdef StateClass state
+    cdef StateC* c_state
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for state in states:
+            c_state = state.c
+            if step < c_state.history.size():
+                step_actions.append(c_state.history[step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 54ee053981f..b2c39ae88bc 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -16,6 +16,8 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
+import logging
 
 from ..util import make_tempdir
 
@@ -412,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -539,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -575,7 +577,6 @@ def test_overfitting_IO(use_upper):
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
-        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a943c3538e0..a6e1852514d 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,16 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.pipeline import DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
@@ -59,6 +60,8 @@
     ),
 ]
 
+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1
 
 
@@ -171,6 +174,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -319,7 +373,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -345,11 +399,15 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
+    fix_random_seed(0)
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -451,10 +509,12 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 998f0472c7e..9648341a106 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v2"
+    @architectures = "spacy.TransitionBasedParser.v3"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b36d3ad7473..dd0a53c910e 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -130,33 +130,11 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -187,7 +165,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser
 
@@ -293,15 +270,16 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        assert model.get_ref("upper").get_dim("nI") == 65
-        assert model.get_ref("lower").get_dim("nI") == 65
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -314,11 +292,13 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        if model.attrs["has_upper"]:
-            assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
@@ -514,9 +494,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index d2a41ff0fed..160e2ecf335 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,24 +1,13 @@
 import ctypes
 import os
 from pathlib import Path
-
-import pytest
-
-try:
-    from pydantic.v1 import ValidationError
-except ImportError:
-    from pydantic import ValidationError  # type: ignore
-
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -101,34 +90,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 914e877f579..b2c93f24bfa 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,5 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
-
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..db8f974ea19 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,18 +833,17 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -874,23 +873,22 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                       |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
-</Accordion>
+ </Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 47028f4a2e7..acc2ce1caa2 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -408,7 +408,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -418,7 +418,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -743,7 +743,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -766,7 +766,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index b44df538766..44c80622437 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..1b0bc9606e9 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 9070023fa172288f9102478c20ac25ef8c90259f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 18:28:30 +0100
Subject: [PATCH 151/504] Fix batching regression (#12094)

* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handed.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
---
 pyproject.toml                        |  2 +-
 requirements.txt                      |  2 +-
 setup.cfg                             |  2 +-
 spacy/tests/pipeline/test_tagger.py   |  4 +-
 spacy/tests/pipeline/test_textcat.py  |  8 +++-
 spacy/tests/training/test_training.py |  4 +-
 spacy/training/batchers.py            | 58 ++++++++++++++-------------
 spacy/util.py                         |  8 ++--
 website/docs/api/top-level.mdx        | 30 +++++++-------
 9 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 58e4382e829..3891d137867 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 8b5cc51f875..43f36f145ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 613410c3347..e8b47eebb9e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,7 +38,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev1,<9.1.0
+    thinc>=9.0.0.dev2,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5da5c209975..b6f94f7f97b 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -24,7 +24,9 @@ def test_issue4348():
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        batches = util.minibatch(
+            TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 214c1bfbed1..2383c36bb01 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -100,7 +100,9 @@ def test_issue3611():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = util.minibatch(
+                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+            )
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -137,7 +139,9 @@ def test_issue4030():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = util.minibatch(
+                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+            )
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 68f7e8a0d57..ef20ec365c6 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -924,7 +924,9 @@ def _train_tuples(train_data):
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
+        batches = minibatch(
+            train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 519e61315da..469bb263016 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,9 +1,10 @@
 import itertools
-from thinc.schedules import Schedule, constant as constant_schedule
+from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int, Schedule[int]]
+SizingSchedule = Union[Iterable[int], int, Schedule]
+Sizing = Union[Iterable[int], int]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -11,7 +12,7 @@
 @registry.batchers("spacy.batch_by_padded.v1")
 def configure_minibatch_by_padded_size(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     buffer: int,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
@@ -21,8 +22,8 @@ def configure_minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
-    size (int or Sequence[int]): The largest padded size to batch sequences into.
-        Can be a single integer, or a sequence, allowing for variable batch sizes.
+    size (int, Iterable[int] or Schedule): The largest padded size to batch sequences
+        into. Can be a single integer, or a sequence, allowing for variable batch sizes.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result
@@ -36,7 +37,7 @@ def configure_minibatch_by_padded_size(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_padded_size,
-        size=size,
+        size=_schedule_to_sizing(size),
         buffer=buffer,
         discard_oversize=discard_oversize,
         **optionals
@@ -46,14 +47,14 @@ def configure_minibatch_by_padded_size(
 @registry.batchers("spacy.batch_by_words.v1")
 def configure_minibatch_by_words(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     tolerance: float,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the "minibatch by words" strategy.
 
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int, Iterable[int] or Schedule): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves
@@ -64,7 +65,7 @@ def configure_minibatch_by_words(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_words,
-        size=size,
+        size=_schedule_to_sizing(size),
         tolerance=tolerance,
         discard_oversize=discard_oversize,
         **optionals
@@ -73,15 +74,15 @@ def configure_minibatch_by_words(
 
 @registry.batchers("spacy.batch_by_sequence.v1")
 def configure_minibatch(
-    size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
+    size: SizingSchedule, get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that creates batches of the specified size.
 
-    size (int or Sequence[int]): The target number of items per batch.
+    size (int, Iterable[int] or Schedule): The target number of items per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     """
     optionals = {"get_length": get_length} if get_length is not None else {}
-    return partial(minibatch, size=size, **optionals)
+    return partial(minibatch, size=_schedule_to_sizing(size), **optionals)
 
 
 def minibatch_by_padded_size(
@@ -97,7 +98,7 @@ def minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
-    size (int or Sequence[int]): The largest padded size to batch sequences into.
+    size (int or Iterable[int]): The largest padded size to batch sequences into.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result
@@ -108,13 +109,12 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
+        size_ = iter(size)
+    for outer_batch in minibatch(seqs, size=buffer):
         outer_batch = list(outer_batch)
-        target_size = size_(step)
+        target_size = next(size_)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -136,7 +136,7 @@ def minibatch_by_words(
     themselves, or be discarded if discard_oversize=True.
 
     seqs (Iterable[Sequence]): The sequences to minibatch.
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int or Iterable[int]): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves
@@ -145,12 +145,10 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    step = 0
-    target_size = size_(step)
+        size_ = iter(size)
+    target_size = next(size_)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -175,8 +173,7 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            step += 1
-            target_size = size_(step)
+            target_size = next(size_)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -194,8 +191,7 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                step += 1
-                target_size = size_(step)
+                target_size = next(size_)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
@@ -232,3 +228,9 @@ def _batch_by_length(
     batches = [list(sorted(batch)) for batch in batches]
     batches.reverse()
     return batches
+
+
+def _schedule_to_sizing(size: SizingSchedule) -> Sizing:
+    if isinstance(size, Schedule):
+        return size.to_generator()
+    return size
diff --git a/spacy/util.py b/spacy/util.py
index 551f78cc969..dedcd17ea58 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1642,12 +1642,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
-        size_ = size
+        size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 77216924405..8555d64ba63 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -878,14 +878,14 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 > get_length = null
 > ```
 
-| Name               | Description                                                                                                                                                                             |
-| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~                                                                                                                                           |
-| `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
-| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                       |
-| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                     |
-| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
-| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
+| Name               | Description                                                                                                                                                                                       |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~                                                                                                                                                     |
+| `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
+| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                                 |
+| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                               |
+| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                           |
+| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                               |
 
 ### spacy.batch_by_sequence.v1 {id="batch_by_sequence",tag="registered function"}
 
@@ -900,11 +900,11 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 
 Create a batcher that creates batches of the specified size.
 
-| Name         | Description                                                                                                                                                                             |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
-| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
-| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
+| Name         | Description                                                                                                                                                                                       |
+| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
+| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                           |
+| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                               |
 
 ### spacy.batch_by_padded.v1 {id="batch_by_padded",tag="registered function"}
 
@@ -926,7 +926,7 @@ sequences in the batch.
 
 | Name               | Description                                                                                                                                                                                                                                 |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~                                          |
+| `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~                                |
 | `buffer`           | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~                                                                                                                                     |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                                                                     |
@@ -1528,7 +1528,7 @@ vary on each step.
 | Name       | Description                                      |
 | ---------- | ------------------------------------------------ |
 | `items`    | The items to batch up. ~~Iterable[Any]~~         |
-| `size`     | The batch size(s). ~~Union[int, Sequence[int]]~~ |
+| `size`     | The batch size(s). ~~Union[int, Iterable[int]]~~ |
 | **YIELDS** | The batches.                                     |
 
 ### util.filter_spans {id="util.filter_spans",tag="function",version="2.1.4"}

From c27fca84cff1c7ae6f87ba18578897f4b5e61211 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Jan 2023 09:25:34 +0100
Subject: [PATCH 152/504] Set version to v4.0.0.dev0 (#12126)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index f5ee66dae6f..1ce8a44c9a4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "4.0.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 6ba4d8cf1d87de9c271d4ad9fb4adfa1e5869281 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 25 Jan 2023 12:50:21 +0900
Subject: [PATCH 153/504] Refactor lexeme mem passing (#12125)

* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods calling _new_lexeme, get and get_by_orth, took mem arguments
just to call the internal method. That's no longer necessary, so this
cleans it up.

* prettier formatting

* Remove more unused mem args
---
 spacy/lexeme.pyx                    |  2 +-
 spacy/tokenizer.pxd                 | 76 ++++++++---------------------
 spacy/tokenizer.pyx                 | 39 +++++++--------
 spacy/tokens/doc.pyx                |  8 +--
 spacy/tokens/retokenizer.pyx        |  4 +-
 spacy/vocab.pxd                     |  7 ++-
 spacy/vocab.pyx                     | 30 ++++--------
 website/docs/api/cython-classes.mdx | 20 ++++----
 8 files changed, 67 insertions(+), 119 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 3e63afa34ba..41fc8f1d2b1 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -57,7 +57,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index c963dcbcfa4..58d30c3202f 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -31,58 +31,24 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
-    cdef void _filter_special_spans(
-        self,
-        vector[SpanC] &original,
-        vector[SpanC] &filtered,
-        int doc_len,
-    ) nogil
-    cdef object _prepare_special_spans(
-        self,
-        Doc doc,
-        vector[SpanC] &filtered,
-    )
-    cdef int _retokenize_special_spans(
-        self,
-        Doc doc,
-        TokenC* tokens,
-        object span_data,
-    )
-    cdef int _try_specials_and_cache(
-        self,
-        hash_t key,
-        Doc tokens,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _tokenize(
-        self,
-        Doc tokens,
-        str span,
-        hash_t key,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    )
-    cdef int _attach_tokens(
-        self,
-        Doc tokens,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _save_cached(
-        self,
-        const TokenC* tokens,
-        hash_t key,
-        int* has_special,
-        int n,
-    ) except -1
+    cdef void _filter_special_spans(self, vector[SpanC] &original,
+                            vector[SpanC] &filtered, int doc_len) nogil
+    cdef object _prepare_special_spans(self, Doc doc,
+                                       vector[SpanC] &filtered)
+    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
+                                       object span_data)
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
+                       int* has_special, bint with_special_cases) except -1
+    cdef str _split_affixes(self, str string,
+                                vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes, int* has_special,
+                                bint with_special_cases)
+    cdef int _attach_tokens(self, Doc tokens, str string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes, int* has_special,
+                            bint with_special_cases) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int* has_special, int n) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index cdb7dda7094..12a78d39fc4 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -393,22 +393,19 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+        span = self._split_affixes(span, &prefixes, &suffixes,
                                    has_special, with_special_cases)
         self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                             with_special_cases)
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[const LexemeC*] *prefixes,
-        vector[const LexemeC*] *suffixes,
-        int* has_special,
-        bint with_special_cases
-    ):
+    cdef str _split_affixes(self, str string,
+                                vector[const LexemeC*] *prefixes,
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special,
+                                bint with_special_cases):
+        cdef size_t i
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -426,7 +423,7 @@ cdef class Tokenizer:
                 minus_pre = string[pre_len:]
                 if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    prefixes.push_back(self.vocab.get(prefix))
                     break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -434,18 +431,18 @@ cdef class Tokenizer:
                 minus_suf = string[:-suf_len]
                 if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    suffixes.push_back(self.vocab.get(suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(mem, prefix))
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                prefixes.push_back(self.vocab.get(prefix))
+                suffixes.push_back(self.vocab.get(suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(mem, prefix))
+                prefixes.push_back(self.vocab.get(prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                suffixes.push_back(self.vocab.get(suffix))
         return string
 
     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -470,11 +467,11 @@ cdef class Tokenizer:
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
-                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                    tokens.push_back(self.vocab.get(string), False)
                 else:
                     # Let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
@@ -489,7 +486,7 @@ cdef class Tokenizer:
 
                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            tokens.push_back(self.vocab.get(span), False)
 
                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -497,11 +494,11 @@ cdef class Tokenizer:
                             # for tokenization in some languages (see
                             # https://github.com/explosion/spaCy/issues/768)
                             infix_span = string[infix_start:infix_end]
-                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                            tokens.push_back(self.vocab.get(infix_span), False)
                         start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 09dc94297f0..56ee216d17f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -302,12 +302,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1475,7 +1475,7 @@ cdef class Doc:
             end = start + attrs[i, 0]
             has_space = attrs[i, 1]
             orth_ = text[start:end]
-            lex = self.vocab.get(self.mem, orth_)
+            lex = self.vocab.get(orth_)
             self.push_back(lex, has_space)
             start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1580,7 +1580,7 @@ cdef class Doc:
         assert words == reconstructed_words
 
         for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
             self.push_back(lex, has_space)
 
         # Set remaining token-level attributes via Doc.from_array().
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index d3e9c5674cc..c0052ca9a9a 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -220,7 +220,7 @@ def _merge(Doc doc, merges):
             if doc.vocab.vectors_length > 0:
                 doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -360,7 +360,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index b91ce3ab45b..f9e01b186b3 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 834f21c35dc..8ac1215dead 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -145,7 +145,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,9 +162,9 @@ cdef class Vocab:
                                                   orth=key, orth_id=string))
             return lex
         else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)
 
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -176,21 +176,10 @@ cdef class Vocab:
         if lex != NULL:
             return lex
         else:
-            return self._new_lexeme(mem, self.strings[orth])
-
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
-        mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #    mem = self.mem
-        cdef bint is_oov = mem is not self.mem
-        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+            return self._new_lexeme(self.strings[orth])
+
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
         if self.vectors is not None and hasattr(self.vectors, "key2row"):
@@ -204,8 +193,7 @@ cdef class Vocab:
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
@@ -276,7 +264,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
+            lex = <LexemeC*>self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
diff --git a/website/docs/api/cython-classes.mdx b/website/docs/api/cython-classes.mdx
index ce7c03940ac..88bd92c723b 100644
--- a/website/docs/api/cython-classes.mdx
+++ b/website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `string`    | The string of the word to look up. ~~str~~                                                                 |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                       |
+| ----------- | ------------------------------------------------- |
+| `string`    | The string of the word to look up. ~~str~~        |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
 
 ### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
 
@@ -183,11 +182,10 @@ vocabulary.
 > lexeme = vocab.get_by_orth(doc[0].lex.norm)
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~                                                     |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
+| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~      |
 
 ## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
 

From ae90f7c7a343f338e2e8eb9db36cc221170837df Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:29:46 +0100
Subject: [PATCH 154/504] Format

---
 spacy/pipeline/edit_tree_lemmatizer.py |  2 +-
 spacy/pipeline/entity_linker.py        | 12 ++++++++++--
 spacy/pipeline/ner.py                  |  7 +++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index d5169178b8c..a1bcb98455c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -167,7 +167,7 @@ def get_teacher_student_loss(
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
         loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index bab79282d5b..9c4312f6dd8 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -463,7 +463,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         docs_ents: List[Ragged] = []
         docs_scores: List[Ragged] = []
         if not docs:
-            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+            return {
+                KNOWLEDGE_BASE_IDS: final_kb_ids,
+                "ents": docs_ents,
+                "scores": docs_scores,
+            }
         if isinstance(docs, Doc):
             docs = [docs]
         for doc in docs:
@@ -565,7 +569,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+        return {
+            KNOWLEDGE_BASE_IDS: final_kb_ids,
+            "ents": docs_ents,
+            "scores": docs_scores,
+        }
 
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 4c2a3ac093c..2c5fd89cc5d 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -260,8 +260,11 @@ def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
     def labels(self):
         # Get the labels from the model by looking at the available moves, e.g.
         # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        labels = set(remove_bilu_prefix(move) for move in self.move_names
-                     if move[0] in ("B", "I", "L", "U"))
+        labels = set(
+            remove_bilu_prefix(move)
+            for move in self.move_names
+            if move[0] in ("B", "I", "L", "U")
+        )
         return tuple(sorted(labels))
 
     def scored_ents(self, beams):

From 327fbeacaf3dc16f91ed00c21cf4ded6c6239ecf Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:37:02 +0100
Subject: [PATCH 155/504] CI: Skip tests that require published pipelines

---
 .github/azure-steps.yml | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index c7722391fec..fc83d4994b4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -64,12 +64,17 @@ steps:
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)
 
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -m spacy download ca_core_news_sm
+#      python -m spacy download ca_core_news_md
+#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#    displayName: 'Test download CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -93,17 +98,17 @@ steps:
     displayName: 'Test train CLI'
     condition: eq(variables['python_version'], '3.8')
 
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#    displayName: 'Test assemble CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#    displayName: 'Test assemble CLI vectors warning'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json

From 3bc599a78b4147e37f789d2a4ac4e96a7bb36793 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:48:20 +0100
Subject: [PATCH 156/504] Drop python 3.6/3.7, remove unneeded compat (#12187)

* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
---
 .pre-commit-config.yaml         |  2 +-
 CONTRIBUTING.md                 |  2 +-
 README.md                       |  2 +-
 azure-pipelines.yml             | 20 +----------
 requirements.txt                |  5 ++-
 setup.cfg                       |  6 ++--
 spacy/cli/_util.py              | 10 ++++++
 spacy/cli/debug_data.py         |  8 +++++
 spacy/compat.py                 | 13 -------
 spacy/errors.py                 |  3 +-
 spacy/language.py               | 61 +++++++++++++++------------------
 spacy/matcher/matcher.pyi       | 17 ++-------
 spacy/matcher/phrasematcher.pyi |  7 ++--
 spacy/ml/models/parser.py       |  5 +--
 spacy/pipeline/spancat.py       |  9 +++--
 spacy/schemas.py                |  9 +++++
 spacy/ty.py                     | 16 ++-------
 spacy/util.py                   | 11 +++---
 website/docs/usage/index.mdx    |  2 +-
 19 files changed, 87 insertions(+), 121 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e2c5e98fd97..8efe733f904 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 22.3.0
     hooks:
     - id: black
-      language_version: python3.7
+      language_version: python3.8
       additional_dependencies: ['click==8.0.4']
 -   repo: https://github.com/pycqa/flake8
     rev: 5.0.4
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ed75e1fd8bd..b85ea8fcc4d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -276,7 +276,7 @@ except:  # noqa: E722
 
 ### Python conventions
 
-All Python code must be written **compatible with Python 3.6+**. More detailed
+All Python code must be written **compatible with Python 3.8+**. More detailed
 code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
 
 #### I/O and handling paths
diff --git a/README.md b/README.md
index afa96363b65..9e5c4be6898 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.7+ (only 64 bit)
+- **Python version**: Python 3.8+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 0f7ea91f96f..99f1b8afffe 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: "3.7"
+          versionSpec: "3.8"
       - script: |
           pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
@@ -40,24 +40,6 @@ jobs:
     strategy:
       matrix:
         # We're only running one platform per Python version to speed up builds
-        Python36Linux:
-          imageName: "ubuntu-20.04"
-          python.version: "3.6"
-        #        Python36Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.6"
-        #        Python36Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.6"
-        #        Python37Linux:
-        #          imageName: "ubuntu-20.04"
-        #          python.version: "3.7"
-        Python37Windows:
-          imageName: "windows-latest"
-          python.version: "3.7"
-        #        Python37Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.7"
         #        Python38Linux:
         #          imageName: "ubuntu-latest"
         #          python.version: "3.8"
diff --git a/requirements.txt b/requirements.txt
index 43f36f145ea..a63875eda6a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.11,<3.1.0
+spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -31,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=0.990,<0.1000; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/setup.cfg b/setup.cfg
index e8b47eebb9e..749ac0959ad 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,7 +17,6 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
@@ -30,10 +29,10 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.11,<3.1.0
+    spacy-legacy>=4.0.0.dev0,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
@@ -54,7 +53,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index fa41e6a08e0..ea91e64247d 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,3 +1,10 @@
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
+from typing import TYPE_CHECKING, overload
+import sys
+import shutil
+from pathlib import Path
+from wasabi import msg, Printer
+import srsly
 import hashlib
 import os
 import shutil
@@ -27,6 +34,9 @@
 from wasabi import Printer, msg
 from weasel import app as project_cli
 
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 from ..compat import Literal
 from ..schemas import validate
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index af3c24f3ba9..c2253b0cb70 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,3 +1,11 @@
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import Literal, cast, overload
+from pathlib import Path
+from collections import Counter
+import sys
+import srsly
+from wasabi import Printer, MESSAGES, msg
+import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/compat.py b/spacy/compat.py
index 522fa30ddde..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,19 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
-# Important note: The importlib_metadata "backport" includes functionality
-# that's not part of the built-in importlib.metadata. We should treat this
-# import like the built-in and only use what's available there.
-try:  # Python 3.8+
-    import importlib.metadata as importlib_metadata
-except ImportError:
-    from catalogue import _importlib_metadata as importlib_metadata  # type: ignore[no-redef]    # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 9074a3fead8..dcf8e60b7a1 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,7 +1,6 @@
+from typing import Literal
 import warnings
 
-from .compat import Literal
-
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
diff --git a/spacy/language.py b/spacy/language.py
index a47cc5df454..161d5b64884 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,3 +1,10 @@
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
+from typing import Union, Tuple, List, Set, Pattern, Sequence
+from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
+
+from dataclasses import dataclass
+import random
+import itertools
 import functools
 import inspect
 import itertools
@@ -30,43 +37,29 @@
     overload,
 )
 
-import srsly
-from thinc.api import Config, CupyOps, Optimizer, get_current_ops
-
-from . import about, ty, util
-from .compat import Literal
+from . import ty
+from .tokens.underscore import Underscore
+from .vocab import Vocab, create_vocab
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
+from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
+from .scorer import Scorer
+from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
+from .util import warn_if_jupyter_cupy
+from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
+from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.punctuation import TOKENIZER_INFIXES
+from .tokens import Doc
+from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from . import util
+from . import about
 from .lookups import load_lookups
-from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
-from .schemas import (
-    ConfigSchema,
-    ConfigSchemaInit,
-    ConfigSchemaNlp,
-    ConfigSchemaPretrain,
-    validate_init_settings,
-)
-from .scorer import Scorer
-from .tokenizer import Tokenizer
-from .tokens import Doc
-from .tokens.underscore import Underscore
-from .training import Example, validate_examples
-from .training.initialize import init_tok2vec, init_vocab
-from .util import (
-    _DEFAULT_EMPTY_PIPES,
-    CONFIG_SECTION_ORDER,
-    SimpleFrozenDict,
-    SimpleFrozenList,
-    _pipe,
-    combine_score_weights,
-    raise_error,
-    registry,
-    warn_if_jupyter_cupy,
-)
-from .vectors import BaseVectors
-from .vocab import Vocab, create_vocab
+
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index c33b534cbd2..a0b6d91e7d5 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,17 +1,6 @@
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Tuple,
-    Union,
-    overload,
-)
-
-from ..compat import Literal
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
+from typing import Iterator, Iterable, overload
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index f9585da7893..45685db228a 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,6 +1,7 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
-
-from ..compat import Literal
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
+from typing import overload
+from .matcher import Matcher
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 59483839206..01312983d86 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,9 @@
-from typing import Optional, List, Tuple, Any
+from typing import Optional, List, Tuple, Any, Literal
 from thinc.types import Floats2d
 from thinc.api import Model
 import warnings
 
 from ...errors import Errors, Warnings
-from ...compat import Literal
-from ...errors import Errors
-from ...tokens import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
 from ...tokens.doc import Doc
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 1450bb5d6cb..bfaaf82e8d0 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,5 +1,5 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union
+from typing import Union, Protocol, runtime_checkable
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
@@ -8,7 +8,12 @@
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
 from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
 
-from ..compat import Protocol, runtime_checkable
+from ..scorer import Scorer
+from ..language import Language
+from .trainable_pipe import TrainablePipe
+from ..tokens import Doc, SpanGroup, Span
+from ..vocab import Vocab
+from ..training import Example, validate_examples
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 9a2b5ed60e9..831f7df058f 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,3 +1,12 @@
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
+from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError, Model
+from thinc.config import Promise
+from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/ty.py b/spacy/ty.py
index f389456c03e..5a2b44aa583 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,17 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Sequence,
-)
-
-from thinc.api import Model, Optimizer
-
-from .compat import Protocol, runtime_checkable
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index dedcd17ea58..de04ee6e718 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,5 +1,6 @@
 import functools
 import importlib
+import importlib.metadata
 import importlib.util
 import re
 from pathlib import Path
@@ -70,7 +71,7 @@
 
 
 from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .compat import cupy, CudaStream, is_windows
 from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
@@ -748,8 +749,8 @@ def get_package_version(name: str) -> Optional[str]:
     RETURNS (str / None): The version or None if package not installed.
     """
     try:
-        return importlib_metadata.version(name)  # type: ignore[attr-defined]
-    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return importlib.metadata.version(name)  # type: ignore[attr-defined]
+    except importlib.metadata.PackageNotFoundError:  # type: ignore[attr-defined]
         return None
 
 
@@ -937,7 +938,7 @@ def is_package(name: str) -> bool:
     RETURNS (bool): True if installed package, False if not.
     """
     try:
-        importlib_metadata.distribution(name)  # type: ignore[attr-defined]
+        importlib.metadata.distribution(name)  # type: ignore[attr-defined]
         return True
     except:  # noqa: E722
         return False
@@ -1777,7 +1778,7 @@ def packages_distributions() -> Dict[str, List[str]]:
     it's not available in the builtin importlib.metadata.
     """
     pkg_to_dist = defaultdict(list)
-    for dist in importlib_metadata.distributions():
+    for dist in importlib.metadata.distributions():
         for pkg in (dist.read_text("top_level.txt") or "").split():
             pkg_to_dist[pkg].append(dist.metadata["Name"])
     return dict(pkg_to_dist)
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index c50e9db6c6b..b8b4917f2b2 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -20,7 +20,7 @@ menu:
 
 ## Installation instructions {id="installation"}
 
-spaCy is compatible with **64-bit CPython 3.7+** and runs on **Unix/Linux**,
+spaCy is compatible with **64-bit CPython 3.8+** and runs on **Unix/Linux**,
 **macOS/OS X** and **Windows**. The latest spaCy releases are available over
 [pip](https://pypi.python.org/pypi/spacy) and
 [conda](https://anaconda.org/conda-forge/spacy).

From 6652879588014905a0b82b269b9f05149095f5e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Jan 2023 12:44:11 +0100
Subject: [PATCH 157/504] Add `Language.distill` (#12116)

* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify that how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                       | 108 +++++++++++++++++++++++-
 spacy/pipeline/trainable_pipe.pyx       |   4 +-
 spacy/pipeline/transition_parser.pyx    |   4 +-
 spacy/tests/test_language.py            |  69 +++++++++++++++
 spacy/ty.py                             |  19 +++++
 website/docs/api/dependencyparser.mdx   |  18 ++--
 website/docs/api/edittreelemmatizer.mdx |  18 ++--
 website/docs/api/entityrecognizer.mdx   |  18 ++--
 website/docs/api/language.mdx           |  28 ++++++
 website/docs/api/morphologizer.mdx      |  18 ++--
 website/docs/api/pipe.mdx               |  18 ++--
 website/docs/api/sentencerecognizer.mdx |  18 ++--
 website/docs/api/tagger.mdx             |  18 ++--
 13 files changed, 290 insertions(+), 68 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 161d5b64884..8cd439d10b1 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -41,7 +41,7 @@
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
+from .training import Example, validate_examples, validate_distillation_examples
 from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
@@ -1049,6 +1049,102 @@ def __call__(
                 raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
         return doc
 
+    def distill(
+        self,
+        teacher: "Language",
+        examples: Iterable[Example],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
+        student_to_teacher: Optional[Dict[str, str]] = None,
+    ):
+        """Distill the models in a student pipeline from a teacher pipeline.
+        teacher (Language): Teacher to distill from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
+        drop (float): The dropout rate.
+        sgd (Optional[Optimizer]): An optimizer.
+        losses (Optional(Dict[str, float])): Dictionary to update with the loss,
+            keyed by component.
+        component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
+            for specific pipeline components, keyed by component name.
+        exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
+        student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
+            teacher pipe name, only needed for pipes where the student pipe
+            name does not match the teacher pipe name.
+        RETURNS (Dict[str, float]): The updated losses dictionary
+
+        DOCS: https://spacy.io/api/language#distill
+        """
+        if student_to_teacher is None:
+            student_to_teacher = {}
+        if losses is None:
+            losses = {}
+        if isinstance(examples, list) and len(examples) == 0:
+            return losses
+
+        validate_distillation_examples(examples, "Language.distill")
+        examples = _copy_examples(examples)
+
+        if sgd is None:
+            if self._optimizer is None:
+                self._optimizer = self.create_optimizer()
+            sgd = self._optimizer
+
+        if component_cfg is None:
+            component_cfg = {}
+        pipe_kwargs = {}
+        for student_name, student_proc in self.pipeline:
+            component_cfg.setdefault(student_name, {})
+            pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
+            component_cfg[student_name].setdefault("drop", drop)
+            pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
+
+        teacher_pipes = dict(teacher.pipeline)
+        for student_name, student_proc in self.pipeline:
+            if student_name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=student_proc,
+                        name=student_name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[student_name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
+
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+            ):
+                # A missing teacher pipe is not an error, some student pipes
+                # do not need a teacher, such as tok2vec layer losses.
+                teacher_name = (
+                    student_to_teacher[student_name]
+                    if student_name in student_to_teacher
+                    else student_name
+                )
+                teacher_pipe = teacher_pipes.get(teacher_name, None)
+                student_proc.distill(
+                    teacher_pipe,
+                    examples,
+                    sgd=sgd,
+                    losses=losses,
+                    **component_cfg[student_name],
+                )
+
+        return losses
+
     def disable_pipes(self, *names) -> "DisabledPipes":
         """Disable one or more pipeline components. If used as a context
         manager, the pipeline will be restored to the initial state at the end
@@ -1274,12 +1370,16 @@ def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
+        labels: Optional[Dict[str, Any]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
+        labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
+            using the names of the pipes as keys. Overrides labels that are in
+            the model configuration.
         sgd (Optional[Optimizer]): An optimizer to use for updates. If not
             provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
@@ -1327,6 +1427,8 @@ def get_examples():
         for name, proc in self.pipeline:
             if isinstance(proc, ty.InitializableComponent):
                 p_settings = I["components"].get(name, {})
+                if labels is not None and name in labels:
+                    p_settings["labels"] = labels[name]
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
@@ -1800,6 +1902,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
+        orig_distill = config.pop("distill", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1808,6 +1911,9 @@ def from_config(
             filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
+        if orig_distill is not None:
+            filled["distill"] = orig_distill
+            config["distill"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 3ec3e7551aa..97442a1aa97 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -74,8 +74,8 @@ cdef class TrainablePipe(Pipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d71a4ab0355..6a50dbacaeb 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -233,8 +233,8 @@ class Parser(TrainablePipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index d229739e1ee..8138cb157d2 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -28,6 +28,12 @@
     pass
 
 
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
@@ -805,3 +811,66 @@ def bad_pipe(doc):
     nlp.add_pipe("test_component_bad_pipe")
     with pytest.raises(ValueError, match="instead of a Doc"):
         nlp("text")
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"])
+def test_distill(teacher_tagger_name):
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name)
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses[teacher_tagger_name] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TAGGER_TRAIN_DATA
+    ]
+
+    student_to_teacher = (
+        None
+        if teacher_tagger.name == student_tagger.name
+        else {student_tagger.name: teacher_tagger.name}
+    )
+
+    for i in range(50):
+        losses = {}
+        student.distill(
+            teacher,
+            distill_examples,
+            sgd=optimizer,
+            losses=losses,
+            student_to_teacher=student_to_teacher,
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Do an extra update to check if annotates works, though we can't really
+    # validate the resuls, since the annotations are ephemeral.
+    student.distill(
+        teacher,
+        distill_examples,
+        sgd=optimizer,
+        losses=losses,
+        student_to_teacher=student_to_teacher,
+        annotates=["tagger"],
+    )
diff --git a/spacy/ty.py b/spacy/ty.py
index 5a2b44aa583..ac09cb336ac 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -25,6 +25,25 @@ def finish_update(self, sgd: Optimizer) -> None:
         ...
 
 
+@runtime_checkable
+class DistillableComponent(Protocol):
+    is_distillable: bool
+
+    def distill(
+        self,
+        teacher_pipe: Optional[TrainableComponent],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None
+    ) -> Dict[str, float]:
+        ...
+
+    def finish_update(self, sgd: Optimizer) -> None:
+        ...
+
+
 @runtime_checkable
 class InitializableComponent(Protocol):
     def initialize(
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 5179ce48b84..296d6d87da5 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -154,15 +154,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 2e099365758..c8b5c71806b 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -138,15 +138,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 005d5d11deb..f503cc998b0 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -150,15 +150,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index d5fbae05ec4..2a1f7a1a961 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -333,6 +333,34 @@ and custom registered functions if needed. See the
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
 
+## Language.distill {id="distill",tag="method,experimental",version="4"}
+
+Distill the models in a student pipeline from a teacher pipeline.
+
+> #### Example
+>
+> ```python
+>
+> teacher = spacy.load("en_core_web_lg")
+> student = English()
+> student.add_pipe("tagger")
+> student.distill(teacher, examples, sgd=optimizer)
+> ```
+
+| Name                 | Description                                                                                                                                                                                 |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_       |                                                                                                                                                                                             |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4f79458d319..4660ec312fa 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -144,15 +144,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index 120c8f6908f..e1e7f5d7021 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -257,15 +257,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 02fd57102e2..dfb7ed308ba 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -129,15 +129,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 664fd7940c1..35e7a23b174 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -128,15 +128,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Tagger.pipe {id="pipe",tag="method"}
 

From 46aaa4d3e1630704279867590e719f822a627611 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 31 Jan 2023 19:31:17 +0900
Subject: [PATCH 158/504] Don't re-download installed models (#12188)

* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 .github/azure-steps.yml |  5 +++++
 spacy/cli/download.py   | 11 ++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index fc83d4994b4..11dc7e295e4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -74,6 +74,11 @@ steps:
 #  - script: |
 #      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
 #    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
+#    displayName: 'Test skip re-download (#12188)'
 #    condition: eq(variables['python_version'], '3.8')
 
   - script: |
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index f371d110319..ea7a06c5417 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,8 @@
 
 from .. import about
 from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
+from ..util import get_package_version
 
 
 @app.command(
@@ -71,6 +72,14 @@ def download(
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
 
+    # If we already have this version installed, skip downloading
+    installed = get_installed_models()
+    if model_name in installed:
+        installed_version = get_package_version(model_name)
+        if installed_version == version:
+            msg.warn(f"{model_name} v{version} already installed, skipping")
+            return
+
     filename = get_model_filename(model_name, version, sdist)
 
     download_model(filename, pip_args)

From 161e3788128934dddfe7c2cc18cb01e06c3be7af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:06:02 +0100
Subject: [PATCH 159/504] Add the configuration schema for distillation
 (#12201)

* Add the configuration schema for distillation

This also adds the default configuration and some tests. The schema will
be used by the training loop and `distill` subcommand.

* Format

* Change distillation shortopt to -d

* Fix descripion of max_epochs

* Rename distillation flag to -dt

* Rename `pipe_map` to `student_to_teacher`
---
 spacy/cli/init_config.py                      | 15 +++-
 spacy/default_config_distillation.cfg         | 34 ++++++++
 spacy/language.py                             |  3 +
 spacy/schemas.py                              | 23 +++++
 .../tests/serialize/test_serialize_config.py  | 85 +++++++++++++++----
 5 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 spacy/default_config_distillation.cfg

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index a7c03d00f90..129b5a24e84 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -9,7 +9,7 @@
 from wasabi import Printer, diff_strings
 
 from .. import util
-from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
+from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
 from ._util import (
@@ -90,6 +90,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
+    distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
     pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
     code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@@ -105,13 +106,20 @@ def init_fill_config_cli(
     DOCS: https://spacy.io/api/cli#init-fill-config
     """
     import_code(code_path)
-    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
+    fill_config(
+        output_file,
+        base_path,
+        distillation=distillation,
+        pretraining=pretraining,
+        diff=diff,
+    )
 
 
 def fill_config(
     output_file: Path,
     base_path: Path,
     *,
+    distillation: bool = False,
     pretraining: bool = False,
     diff: bool = False,
     silent: bool = False,
@@ -130,6 +138,9 @@ def fill_config(
     # replaced with their actual config after loading, so we have to re-add them
     sourced = util.get_sourced_components(config)
     filled["components"].update(sourced)
+    if distillation:
+        distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
+        filled = distillation_config.merge(filled)
     if pretraining:
         validate_config_for_pretrain(filled, msg)
         pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
diff --git a/spacy/default_config_distillation.cfg b/spacy/default_config_distillation.cfg
new file mode 100644
index 00000000000..1926fafa961
--- /dev/null
+++ b/spacy/default_config_distillation.cfg
@@ -0,0 +1,34 @@
+[paths]
+raw_text = null
+
+[distillation]
+corpus = "corpora.distillation"
+dropout = 0.1
+max_epochs = 1
+max_steps = 0
+student_to_teacher = {}
+
+[distillation.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 3000
+discard_oversize = false
+tolerance = 0.2
+
+[distillation.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 1e-4
+
+[corpora]
+
+[corpora.distillation]
+@readers = "spacy.PlainTextCorpus.v1"
+path = ${paths.raw_text}
+min_length = 0
+max_length = 0
diff --git a/spacy/language.py b/spacy/language.py
index 8cd439d10b1..a1fa61d0923 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -67,6 +67,9 @@
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
+# This is the base config for the [distillation] block and currently not included
+# in the main config and only added via the 'init fill-config' command
+DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
 # This is the base config for the [pretraining] block and currently not included
 # in the main config and only added via the 'init fill-config' command
 DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 831f7df058f..32fb042b5a0 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -470,6 +470,27 @@ class Config:
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaDistillEmpty(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
+class ConfigSchemaDistill(BaseModel):
+    # fmt: off
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
+    dropout: StrictFloat = Field(..., title="Dropout rate")
+    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
+    max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
+    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
@@ -477,6 +498,7 @@ class ConfigSchema(BaseModel):
     components: Dict[str, Dict[str, Any]]
     corpora: Dict[str, Reader]
     initialize: ConfigSchemaInit
+    distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]
 
     class Config:
         extra = "allow"
@@ -488,6 +510,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
+    "distill": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index dd0a53c910e..eb0dcc1e38c 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,21 +5,14 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH, Language
-from spacy.ml.models import (
-    MaxoutWindowEncoder,
-    MultiHashEmbed,
-    build_tb_parser_model,
-    build_Tok2Vec_model,
-)
-from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
-from spacy.training import Example
-from spacy.util import (
-    load_config,
-    load_config_from_str,
-    load_model_from_config,
-    registry,
-)
+from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
+from spacy.language import Language
+from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
+from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.util import load_config, load_config_from_str
+from spacy.util import load_model_from_config, registry
 
 from ..util import make_tempdir
 
@@ -74,6 +67,60 @@
 width = ${components.tok2vec.model.width}
 """
 
+distill_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v2"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[distill]
+"""
+
+
 pretrain_config_string = """
 [paths]
 train = null
@@ -209,6 +256,14 @@ def test_create_nlp_from_config():
         load_model_from_config(Config(bad_cfg), auto_fill=True)
 
 
+def test_nlp_from_distillation_config():
+    """Test that the default distillation config validates properly"""
+    config = Config().from_str(distill_config_string)
+    distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH)
+    filled = config.merge(distill_config)
+    registry.resolve(filled["distillation"], schema=ConfigSchemaDistill)
+
+
 def test_create_nlp_from_pretraining_config():
     """Test that the default pretraining config validates properly"""
     config = Config().from_str(pretrain_config_string)

From 1969666fcf37d9b7580fb027b7feda49378c6bac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:19:42 +0100
Subject: [PATCH 160/504] Language.distill: copy both reference and predicted
 (#12209)

* Language.distill: copy both reference and predicted

In distillation we also modify the teacher docs (e.g. in tok2vec
components), so we need to copy both the reference and predicted doc.

Problem caught by @shadeMe

* Make new `_copy_examples` args kwonly
---
 spacy/language.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index a1fa61d0923..cb9652e97bf 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1094,7 +1094,7 @@ def distill(
             return losses
 
         validate_distillation_examples(examples, "Language.distill")
-        examples = _copy_examples(examples)
+        examples = _copy_examples(examples, copy_x=True, copy_y=True)
 
         if sgd is None:
             if self._optimizer is None:
@@ -2409,13 +2409,18 @@ def restore(self) -> None:
         self[:] = []
 
 
-def _copy_examples(examples: Iterable[Example]) -> List[Example]:
+def _copy_examples(
+    examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
+) -> List[Example]:
     """Make a copy of a batch of examples, copying the predicted Doc as well.
     This is used in contexts where we need to take ownership of the examples
     so that they can be mutated, for instance during Language.evaluate and
     Language.update.
     """
-    return [Example(eg.x.copy(), eg.y) for eg in examples]
+    return [
+        Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
+        for eg in examples
+    ]
 
 
 def _apply_pipes(

From 100aa8318cf70a0337119ef91e5b23e11b8c20eb Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:30:43 +0100
Subject: [PATCH 161/504] Rename language codes (Icelandic, multi-language)
 (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/_util.py                            | 153 ++++++++++++++++++
 spacy/cli/convert.py                          |   6 +
 spacy/cli/init_config.py                      |  18 +--
 spacy/cli/init_pipeline.py                    |  19 +--
 spacy/errors.py                               |   1 +
 spacy/lang/{is => isl}/__init__.py            |   2 +-
 spacy/lang/{is => isl}/stop_words.py          |   0
 spacy/lang/{xx => mul}/__init__.py            |   4 +-
 spacy/lang/{xx => mul}/examples.py            |   0
 spacy/scorer.py                               |   2 +-
 spacy/tests/README.md                         |   2 +-
 spacy/tests/conftest.py                       |  10 +-
 spacy/tests/doc/test_doc_api.py               |   2 +-
 spacy/tests/lang/{is => isl}/__init__.py      |   0
 spacy/tests/lang/{is => isl}/test_text.py     |   8 +-
 .../tests/lang/{is => isl}/test_tokenizer.py  |   8 +-
 spacy/tests/lang/{xx => mul}/__init__.py      |   0
 spacy/tests/lang/{xx => mul}/test_text.py     |   4 +-
 .../tests/lang/{xx => mul}/test_tokenizer.py  |   8 +-
 spacy/tests/lang/test_initialize.py           |   6 +-
 spacy/tests/pipeline/test_span_ruler.py       |  52 +++---
 spacy/tests/test_language.py                  |   9 +-
 spacy/tests/tokenizer/test_explain.py         |   1 +
 .../training/converters/conll_ner_to_docs.py  |   4 +-
 spacy/training/converters/json_to_docs.py     |  12 +-
 spacy/util.py                                 |   8 +-
 website/docs/api/scorer.mdx                   |   2 +-
 website/docs/usage/models.mdx                 |  12 +-
 website/meta/languages.json                   |   6 +-
 website/src/widgets/quickstart-models.js      |   2 +-
 30 files changed, 254 insertions(+), 107 deletions(-)
 rename spacy/lang/{is => isl}/__init__.py (93%)
 rename spacy/lang/{is => isl}/stop_words.py (100%)
 rename spacy/lang/{xx => mul}/__init__.py (67%)
 rename spacy/lang/{xx => mul}/examples.py (100%)
 rename spacy/tests/lang/{is => isl}/__init__.py (100%)
 rename spacy/tests/lang/{is => isl}/test_text.py (85%)
 rename spacy/tests/lang/{is => isl}/test_tokenizer.py (72%)
 rename spacy/tests/lang/{xx => mul}/__init__.py (100%)
 rename spacy/tests/lang/{xx => mul}/test_text.py (96%)
 rename spacy/tests/lang/{xx => mul}/test_tokenizer.py (68%)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ea91e64247d..52a70cc7320 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -37,6 +37,7 @@
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..compat import Literal
 from ..schemas import validate
@@ -158,6 +159,158 @@ def _parse_override(value: Any) -> Any:
         return str(value)
 
 
+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
+def load_project_config(
+    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
+) -> Dict[str, Any]:
+    """Load the project.yml file from a directory and validate it. Also make
+    sure that all directories defined in the config exist.
+
+    path (Path): The path to the project directory.
+    interpolate (bool): Whether to substitute project variables.
+    overrides (Dict[str, Any]): Optional config overrides.
+    RETURNS (Dict[str, Any]): The loaded project.yml.
+    """
+    config_path = path / PROJECT_FILE
+    if not config_path.exists():
+        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
+    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
+    try:
+        config = srsly.read_yaml(config_path)
+    except ValueError as e:
+        msg.fail(invalid_err, e, exits=1)
+    errors = validate(ProjectConfigSchema, config)
+    if errors:
+        msg.fail(invalid_err)
+        print("\n".join(errors))
+        sys.exit(1)
+    validate_project_version(config)
+    validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
+    # Make sure directories defined in config exist
+    for subdir in config.get("directories", []):
+        dir_path = path / subdir
+        if not dir_path.exists():
+            dir_path.mkdir(parents=True)
+    return config
+
+
+def substitute_project_variables(
+    config: Dict[str, Any],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    key: str = "vars",
+    env_key: str = "env",
+) -> Dict[str, Any]:
+    """Interpolate variables in the project file using the config system.
+
+    config (Dict[str, Any]): The project config.
+    overrides (Dict[str, Any]): Optional config overrides.
+    key (str): Key containing variables in project config.
+    env_key (str): Key containing environment variable mapping in project config.
+    RETURNS (Dict[str, Any]): The interpolated project config.
+    """
+    config.setdefault(key, {})
+    config.setdefault(env_key, {})
+    # Substitute references to env vars with their values
+    for config_var, env_var in config[env_key].items():
+        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
+    # Need to put variables in the top scope again so we can have a top-level
+    # section "project" (otherwise, a list of commands in the top scope wouldn't)
+    # be allowed by Thinc's config system
+    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
+    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
+    interpolated = cfg.interpolate()
+    return dict(interpolated["project"])
+
+
+def validate_project_version(config: Dict[str, Any]) -> None:
+    """If the project defines a compatible spaCy version range, chec that it's
+    compatible with the current version of spaCy.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    spacy_version = config.get("spacy_version", None)
+    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+        err = (
+            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+            f"that's not compatible with the version of spaCy you're running "
+            f"({about.__version__}). You can edit version requirement in the "
+            f"{PROJECT_FILE} to load it, but the project may not run as expected."
+        )
+        msg.fail(err, exits=1)
+
+
+def validate_project_commands(config: Dict[str, Any]) -> None:
+    """Check that project commands and workflows are valid, don't contain
+    duplicates, don't clash  and only refer to commands that exist.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    command_names = [cmd["name"] for cmd in config.get("commands", [])]
+    workflows = config.get("workflows", {})
+    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
+    if duplicates:
+        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
+        msg.fail(err, exits=1)
+    for workflow_name, workflow_steps in workflows.items():
+        if workflow_name in command_names:
+            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
+            msg.fail(err, exits=1)
+        for step in workflow_steps:
+            if step not in command_names:
+                msg.fail(
+                    f"Unknown command specified in workflow '{workflow_name}': {step}",
+                    f"Workflows can only refer to commands defined in the 'commands' "
+                    f"section of the {PROJECT_FILE}.",
+                    exits=1,
+                )
+
+
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
+    """Get the hash for a JSON-serializable object.
+
+    data: The data to hash.
+    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
+    RETURNS (str): The hash.
+    """
+    if isinstance(data, dict):
+        data = {k: v for k, v in data.items() if k not in exclude}
+    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
+    return hashlib.md5(data_str).hexdigest()
+
+
+def get_checksum(path: Union[Path, str]) -> str:
+    """Get the checksum for a file or directory given its file path. If a
+    directory path is provided, this uses all files in that directory.
+
+    path (Union[Path, str]): The file or directory path.
+    RETURNS (str): The checksum.
+    """
+    path = Path(path)
+    if not (path.is_file() or path.is_dir()):
+        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
+    if path.is_file():
+        return hashlib.md5(Path(path).read_bytes()).hexdigest()
+    else:
+        # TODO: this is currently pretty slow
+        dir_checksum = hashlib.md5()
+        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
+            dir_checksum.update(sub_file.read_bytes())
+        return dir_checksum.hexdigest()
+
+
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a66a68133b3..3844b340678 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,6 +8,8 @@
 import srsly
 from wasabi import Printer
 
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
@@ -116,6 +118,10 @@ def convert(
     input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(input_path, converter):
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 129b5a24e84..b29a2b748f2 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,15 +12,9 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import (
-    COMMAND,
-    Arg,
-    Opt,
-    import_code,
-    init_cli,
-    show_validation_error,
-    string_to_list,
-)
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import string_to_list, import_code, _handle_renamed_language_codes
+
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
@@ -50,7 +44,7 @@ class InitValues:
 def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
+    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
     pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -176,6 +170,10 @@ def init_config(
     msg = Printer(no_print=silent)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     # Filter out duplicates since tok2vec and transformer are added by template
     pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
     defaults = RECOMMENDATIONS["__default__"]
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 21eea8edf2f..0ff39d2145b 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,16 +8,8 @@
 
 from .. import util
 from ..language import Language
-from ..training.initialize import convert_vectors, init_nlp
-from ._util import (
-    Arg,
-    Opt,
-    import_code,
-    init_cli,
-    parse_config_overrides,
-    setup_gpu,
-    show_validation_error,
-)
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes
 
 
 @init_cli.command("vectors")
@@ -39,8 +31,11 @@ def init_vectors_cli(
     you can use in the [initialize] block of your config to initialize
     a model with vectors.
     """
-    if verbose:
-        util.logger.setLevel(logging.DEBUG)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
diff --git a/spacy/errors.py b/spacy/errors.py
index dcf8e60b7a1..c8c595395b3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -981,6 +981,7 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
 
+RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
 
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/isl/__init__.py
similarity index 93%
rename from spacy/lang/is/__init__.py
rename to spacy/lang/isl/__init__.py
index af126004536..50929620ced 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/isl/__init__.py
@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
 
 
 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults
 
 
diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/isl/stop_words.py
similarity index 100%
rename from spacy/lang/is/stop_words.py
rename to spacy/lang/isl/stop_words.py
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/mul/__init__.py
similarity index 67%
rename from spacy/lang/xx/__init__.py
rename to spacy/lang/mul/__init__.py
index aff8403ffc7..5170f1e861f 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/mul/__init__.py
@@ -3,10 +3,10 @@
 
 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """
 
-    lang = "xx"
+    lang = "mul"
 
 
 __all__ = ["MultiLanguage"]
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/mul/examples.py
similarity index 100%
rename from spacy/lang/xx/examples.py
rename to spacy/lang/mul/examples.py
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 9ab116deb3f..b590f86337e 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -114,7 +114,7 @@ class Scorer:
     def __init__(
         self,
         nlp: Optional["Language"] = None,
-        default_lang: str = "xx",
+        default_lang: str = "mul",
         default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index f3c96a39e7c..9ac1e6d2e34 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -86,7 +86,7 @@ These are the main fixtures that are currently available:
 
 | Fixture                             | Description                                                                  |
 | ----------------------------------- | ---------------------------------------------------------------------------- |
-| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class. |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `mul` language class. |
 | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                   |
 | `en_vocab`                          | Creates an instance of the English `Vocab`.                                  |
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 6085b89cf02..fdc9f192c2f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -85,7 +85,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):
 
 @pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class("xx")().tokenizer
+    return get_lang_class("mul")().tokenizer
 
 
 @pytest.fixture(scope="session")
@@ -250,8 +250,8 @@ def id_tokenizer():
 
 
 @pytest.fixture(scope="session")
-def is_tokenizer():
-    return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+    return get_lang_class("isl")().tokenizer
 
 
 @pytest.fixture(scope="session")
@@ -513,8 +513,8 @@ def vi_tokenizer():
 
 
 @pytest.fixture(scope="session")
-def xx_tokenizer():
-    return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+    return get_lang_class("mul")().tokenizer
 
 
 @pytest.fixture(scope="session")
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 946910b29e1..518db02e6b3 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -18,7 +18,7 @@
     TAG,
 )
 from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/isl/__init__.py
similarity index 100%
rename from spacy/tests/lang/is/__init__.py
rename to spacy/tests/lang/isl/__init__.py
diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/isl/test_text.py
similarity index 85%
rename from spacy/tests/lang/is/test_text.py
rename to spacy/tests/lang/isl/test_text.py
index 6e3654a6eda..9e177485d09 100644
--- a/spacy/tests/lang/is/test_text.py
+++ b/spacy/tests/lang/isl/test_text.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
     # Excerpt: European Convention on Human Rights
     text = """
 hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja
@@ -15,12 +15,12 @@ def test_long_text(is_tokenizer):
 virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
 og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
 """
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 120
 
 
 @pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
     text = "10. desember 1948"
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 3
diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/isl/test_tokenizer.py
similarity index 72%
rename from spacy/tests/lang/is/test_tokenizer.py
rename to spacy/tests/lang/isl/test_tokenizer.py
index 0c05a605001..ba534aaf662 100644
--- a/spacy/tests/lang/is/test_tokenizer.py
+++ b/spacy/tests/lang/isl/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
     (
         "Enginn maður skal sæta pyndingum eða ómannlegri eða "
         "vanvirðandi meðferð eða refsingu. ",
@@ -23,8 +23,8 @@
 ]
 
 
-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-    tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+    tokens = isl_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/mul/__init__.py
similarity index 100%
rename from spacy/tests/lang/xx/__init__.py
rename to spacy/tests/lang/mul/__init__.py
diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/mul/test_text.py
similarity index 96%
rename from spacy/tests/lang/xx/test_text.py
rename to spacy/tests/lang/mul/test_text.py
index 477f0ebe271..6e4262d6696 100644
--- a/spacy/tests/lang/xx/test_text.py
+++ b/spacy/tests/lang/mul/test_text.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
     # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
     text = """
 Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.  
@@ -20,5 +20,5 @@ def test_long_text(xx_tokenizer):
 Sääʹmteʹǧǧ.
 """
 
-    tokens = xx_tokenizer(text)
+    tokens = mul_tokenizer(text)
     assert len(tokens) == 179
diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/mul/test_tokenizer.py
similarity index 68%
rename from spacy/tests/lang/xx/test_tokenizer.py
rename to spacy/tests/lang/mul/test_tokenizer.py
index 15c760a6b85..3d06dc11cf7 100644
--- a/spacy/tests/lang/xx/test_tokenizer.py
+++ b/spacy/tests/lang/mul/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
     (
         "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
         [
@@ -18,8 +18,8 @@
 ]
 
 
-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-    tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+    tokens = mul_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index 8a158647a69..e0fd534d317 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -7,10 +7,10 @@
 # excluded: ja, ko, th, vi, zh
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
-             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "yo"]
 # fmt: on
 
 
diff --git a/spacy/tests/pipeline/test_span_ruler.py b/spacy/tests/pipeline/test_span_ruler.py
index 0a8616f449b..3dfbccf28e2 100644
--- a/spacy/tests/pipeline/test_span_ruler.py
+++ b/spacy/tests/pipeline/test_span_ruler.py
@@ -46,7 +46,7 @@ def person_org_date_patterns(person_org_patterns):
 
 def test_span_ruler_add_empty(patterns):
     """Test that patterns don't get added excessively."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@@ -57,7 +57,7 @@ def test_span_ruler_add_empty(patterns):
 
 
 def test_span_ruler_init(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
@@ -73,7 +73,7 @@ def test_span_ruler_init(patterns):
 
 
 def test_span_ruler_no_patterns_warns():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
@@ -85,7 +85,7 @@ def test_span_ruler_no_patterns_warns():
 
 def test_span_ruler_init_patterns(patterns):
     # initialize with patterns
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
@@ -109,7 +109,7 @@ def test_span_ruler_init_patterns(patterns):
 
 def test_span_ruler_init_clear(patterns):
     """Test that initialization clears patterns."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -118,7 +118,7 @@ def test_span_ruler_init_clear(patterns):
 
 
 def test_span_ruler_clear(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -132,7 +132,7 @@ def test_span_ruler_clear(patterns):
 
 
 def test_span_ruler_existing(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -147,7 +147,7 @@ def test_span_ruler_existing(patterns):
 
 
 def test_span_ruler_existing_overwrite(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -160,13 +160,13 @@ def test_span_ruler_existing_overwrite(patterns):
 
 
 def test_span_ruler_serialize_bytes(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_nlp = spacy.blank("xx")
+    new_nlp = spacy.blank("mul")
     new_ruler = new_nlp.add_pipe("span_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
@@ -180,7 +180,7 @@ def test_span_ruler_serialize_bytes(patterns):
 
 
 def test_span_ruler_validate():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     validated_ruler = nlp.add_pipe(
         "span_ruler", name="validated_span_ruler", config={"validate": True}
@@ -202,14 +202,14 @@ def test_span_ruler_validate():
 
 
 def test_span_ruler_properties(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
 
 
 def test_span_ruler_overlapping_spans(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -219,7 +219,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
 
 
 def test_span_ruler_scorer(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     text = "foo bar baz"
@@ -242,7 +242,7 @@ def test_span_ruler_multiprocessing(n_process):
 
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
 
-        nlp = spacy.blank("xx")
+        nlp = spacy.blank("mul")
         ruler = nlp.add_pipe("span_ruler")
         ruler.add_patterns(patterns)
 
@@ -252,7 +252,7 @@ def test_span_ruler_multiprocessing(n_process):
 
 
 def test_span_ruler_serialize_dir(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
@@ -263,7 +263,7 @@ def test_span_ruler_serialize_dir(patterns):
 
 
 def test_span_ruler_remove_basic(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina went to school"))
@@ -278,7 +278,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
 
 
 def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     assert len(ruler.patterns) == 3
@@ -289,7 +289,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
 
 
 def test_span_ruler_remove_several_patterns(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@@ -313,7 +313,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
 
 
 def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@@ -331,7 +331,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
 
 
 def test_span_ruler_remove_all_patterns(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     assert len(ruler.patterns) == 4
@@ -347,7 +347,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
 
 
 def test_span_ruler_remove_and_add():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     patterns1 = [{"label": "DATE1", "pattern": "last time"}]
     ruler.add_patterns(patterns1)
@@ -403,7 +403,7 @@ def test_span_ruler_remove_and_add():
 
 
 def test_span_ruler_spans_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@@ -415,7 +415,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
 
 
 def test_span_ruler_ents_default_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -424,7 +424,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
 
 
 def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
@@ -451,7 +451,7 @@ def pass_through_filter(spans1, spans2):
 
         return pass_through_filter
 
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 8138cb157d2..b419d77b51d 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -666,11 +666,12 @@ def test_spacy_blank():
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("mul", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
+        ("xx", "mul"),
         ("zh-Hans", "zh"),
         ("zh-Hant", None),
         ("zxx", None),
@@ -691,11 +692,11 @@ def test_language_matching(lang, target):
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("xx", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
         ("zh-Hans", "zh"),
     ],
 )
diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index 78932f6539c..073899fa50a 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -36,6 +36,7 @@
     "hu",
     pytest.param("id", marks=pytest.mark.slow()),
     pytest.param("it", marks=pytest.mark.slow()),
+    pytest.param("isl", marks=pytest.mark.slow()),
     pytest.param("kn", marks=pytest.mark.slow()),
     pytest.param("lb", marks=pytest.mark.slow()),
     pytest.param("lt", marks=pytest.mark.slow()),
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index b19d1791b27..c3490d4a494 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
         if not conll_doc:
@@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             "Segmenting sentences with sentencizer. (Use `-b model` for "
             "improved parser-based sentence segmentation.)"
         )
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
     words = [line.strip().split()[0] for line in lines]
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index b4beedd2f27..1ff7a64e09d 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,13 +1,9 @@
 import srsly
-
-from ...lang.xx import MultiLanguage
-from ...util import load_model
-from ..example import (
-    _fix_legacy_dict_data,
-    _parse_example_dict_data,
-    annotations_to_doc,
-)
 from ..gold_io import json_iterate, json_to_annotations
+from ..example import annotations_to_doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.mul import MultiLanguage
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/util.py b/spacy/util.py
index de04ee6e718..8c402a74ce9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -325,7 +325,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     import spacy.lang  # noqa: F401
 
     if lang == "xx":
-        return "xx"
+        return "mul"
 
     # Find out which language modules we have
     possible_languages = []
@@ -343,11 +343,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
-    if match == "mul":
-        # Convert 'mul' back to spaCy's 'xx'
-        return "xx"
-    else:
-        return match
+    return match
 
 
 def get_lang_class(lang: str) -> Type["Language"]:
diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx
index 9bdd0a8f435..0c2eefc6722 100644
--- a/website/docs/api/scorer.mdx
+++ b/website/docs/api/scorer.mdx
@@ -30,7 +30,7 @@ Create a new `Scorer`.
 | Name               | Description                                                                                                                                                                                                                               |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `nlp`              | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
-| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~                                                                                                                                            |
+| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~                                                                                                                                           |
 | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~                                                     |
 | _keyword-only_     |                                                                                                                                                                                                                                           |
 | `**kwargs`         | Any additional settings to pass on to the individual scoring methods. ~~Any~~                                                                                                                                                             |
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index e74c37e3080..34927ff3e7b 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -74,23 +74,23 @@ your data.
 
 > ```python
 > # Standard import
-> from spacy.lang.xx import MultiLanguage
+> from spacy.lang.mul import MultiLanguage
 > nlp = MultiLanguage()
 >
 > # With lazy-loading
-> nlp = spacy.blank("xx")
+> nlp = spacy.blank("mul")
 > ```
 
 spaCy also supports pipelines trained on more than one language. This is
 especially useful for named entity recognition. The language ID used for
-multi-language or language-neutral pipelines is `xx`. The language class, a
+multi-language or language-neutral pipelines is `mul`. The language class, a
 generic subclass containing only the base language data, can be found in
-[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).
+[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).
 
 To train a pipeline using the neutral multi-language class, you can set
-`lang = "xx"` in your [training config](/usage/training#config). You can also
+`lang = "mul"` in your [training config](/usage/training#config). You can also
 \import the `MultiLanguage` class directly, or call
-[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
+[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.
 
 ### Chinese language support {id="chinese",version="2.3"}
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index d6a07809795..e520067ba20 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -169,7 +169,7 @@
             "has_examples": true
         },
         {
-            "code": "is",
+            "code": "isl",
             "name": "Icelandic"
         },
         {
@@ -454,9 +454,9 @@
             ]
         },
         {
-            "code": "xx",
+            "code": "mul",
             "name": "Multi-language",
-            "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
+            "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"],
             "example": "This is a sentence about Facebook."
         },
         {
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index b2a0a628018..4994dc22640 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => {
                         </QS>
                         <QS config="example" prompt="python">
                             print([
-                            {code === 'xx'
+                            {code === 'mul'
                                 ? '(ent.text, ent.label) for ent in doc.ents'
                                 : '(w.text, w.pos_) for w in doc'}
                             ])

From 53cf346e9d0236e726de9ab0b119b4ba44586c75 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 1 Feb 2023 17:47:56 +0900
Subject: [PATCH 162/504] Move Entity Linker v1 to spacy-legacy (#12006)

* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
---
 spacy/pipeline/entity_linker.py            |  16 +
 spacy/pipeline/legacy/__init__.py          |   3 -
 spacy/pipeline/legacy/entity_linker.py     | 422 ---------------------
 spacy/tests/pipeline/test_entity_linker.py |   3 +-
 4 files changed, 18 insertions(+), 426 deletions(-)
 delete mode 100644 spacy/pipeline/legacy/__init__.py
 delete mode 100644 spacy/pipeline/legacy/entity_linker.py

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 9c4312f6dd8..db4f0e105c1 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -14,6 +14,16 @@
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
 from thinc.types import Floats2d
 
+from ..kb import KnowledgeBase, Candidate
+from ..ml import empty_kb
+from ..tokens import Doc, Span
+from .pipe import deserialize_config
+from .trainable_pipe import TrainablePipe
+from ..language import Language
+from ..vocab import Vocab
+from ..training import Example, validate_examples, validate_get_examples
+from ..errors import Errors
+from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
@@ -128,6 +138,12 @@ def make_entity_linker(
     """
 
     if not model.attrs.get("include_span_maker", False):
+        try:
+            from spacy_legacy.components.entity_linker import EntityLinker_v1
+        except:
+            raise ImportError(
+                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
+            )
         # The only difference in arguments here is that use_gold_ents and threshold aren't available.
         return EntityLinker_v1(
             nlp.vocab,
diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py
deleted file mode 100644
index f216840dc2c..00000000000
--- a/spacy/pipeline/legacy/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .entity_linker import EntityLinker_v1
-
-__all__ = ["EntityLinker_v1"]
diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py
deleted file mode 100644
index 1e46db019d5..00000000000
--- a/spacy/pipeline/legacy/entity_linker.py
+++ /dev/null
@@ -1,422 +0,0 @@
-# This file is present to provide a prior version of the EntityLinker component
-# for backwards compatability. For details see #9669.
-
-import random
-import warnings
-from itertools import islice
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
-
-import srsly
-from thinc.api import CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats2d
-
-from ... import util
-from ...errors import Errors, Warnings
-from ...kb import Candidate, KnowledgeBase
-from ...language import Language
-from ...ml import empty_kb
-from ...scorer import Scorer
-from ...tokens import Doc, Span
-from ...training import Example, validate_examples, validate_get_examples
-from ...util import SimpleFrozenList
-from ...vocab import Vocab
-from ..pipe import deserialize_config
-from ..trainable_pipe import TrainablePipe
-
-# See #9050
-BACKWARD_OVERWRITE = True
-
-
-def entity_linker_score(examples, **kwargs):
-    return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
-
-
-class EntityLinker_v1(TrainablePipe):
-    """Pipeline component for named entity linking.
-
-    DOCS: https://spacy.io/api/entitylinker
-    """
-
-    NIL = "NIL"  # string used to refer to a non-existing link
-
-    def __init__(
-        self,
-        vocab: Vocab,
-        model: Model,
-        name: str = "entity_linker",
-        *,
-        labels_discard: Iterable[str],
-        n_sents: int,
-        incl_prior: bool,
-        incl_context: bool,
-        entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
-        overwrite: bool = BACKWARD_OVERWRITE,
-        scorer: Optional[Callable] = entity_linker_score,
-    ) -> None:
-        """Initialize an entity linker.
-
-        vocab (Vocab): The shared vocabulary.
-        model (thinc.api.Model): The Thinc Model powering the pipeline component.
-        name (str): The component instance name, used to add entries to the
-            losses during training.
-        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
-        n_sents (int): The number of neighbouring sentences to take into account.
-        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
-        incl_context (bool): Whether or not to include the local context in the model.
-        entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
-            produces a list of candidates, given a certain knowledge base and a textual mention.
-        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
-        DOCS: https://spacy.io/api/entitylinker#init
-        """
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        self.labels_discard = list(labels_discard)
-        self.n_sents = n_sents
-        self.incl_prior = incl_prior
-        self.incl_context = incl_context
-        self.get_candidates = get_candidates
-        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
-        self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
-        self.scorer = scorer
-
-    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
-        """Define the KB of this pipe by providing a function that will
-        create it using this object's vocab."""
-        if not callable(kb_loader):
-            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
-
-        self.kb = kb_loader(self.vocab)
-
-    def validate_kb(self) -> None:
-        # Raise an error if the knowledge base is not initialized.
-        if self.kb is None:
-            raise ValueError(Errors.E1018.format(name=self.name))
-        if len(self.kb) == 0:
-            raise ValueError(Errors.E139.format(name=self.name))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
-    ):
-        """Initialize the pipe for training, using a representative set
-        of data examples.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
-            Note that providing this argument, will overwrite all data accumulated in the current KB.
-            Use this only when loading a KB as-such from file.
-
-        DOCS: https://spacy.io/api/entitylinker#initialize
-        """
-        validate_get_examples(get_examples, "EntityLinker_v1.initialize")
-        if kb_loader is not None:
-            self.set_kb(kb_loader)
-        self.validate_kb()
-        nO = self.kb.entity_vector_length
-        doc_sample = []
-        vector_sample = []
-        for example in islice(get_examples(), 10):
-            doc_sample.append(example.x)
-            vector_sample.append(self.model.ops.alloc1f(nO))
-        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(
-            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
-        )
-
-    def update(
-        self,
-        examples: Iterable[Example],
-        *,
-        drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
-        losses: Optional[Dict[str, float]] = None,
-    ) -> Dict[str, float]:
-        """Learn from a batch of documents and gold-standard information,
-        updating the pipe's model. Delegates to predict and get_loss.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://spacy.io/api/entitylinker#update
-        """
-        self.validate_kb()
-        if losses is None:
-            losses = {}
-        losses.setdefault(self.name, 0.0)
-        if not examples:
-            return losses
-        validate_examples(examples, "EntityLinker_v1.update")
-        sentence_docs = []
-        for eg in examples:
-            sentences = [s for s in eg.reference.sents]
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                # KB ID of the first token is the same as the whole span
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    try:
-                        # find the sentence in the list of sentences.
-                        sent_index = sentences.index(ent.sent)
-                    except AttributeError:
-                        # Catch the exception when ent.sent is None and provide a user-friendly warning
-                        raise RuntimeError(Errors.E030) from None
-                    # get n previous sentences, if there are any
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    # get n posterior sentences, or as many < n as there are
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    # get token positions
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    # append that span as a doc to training
-                    sent_doc = eg.predicted[start_token:end_token].as_doc()
-                    sentence_docs.append(sent_doc)
-        set_dropout_rate(self.model, drop)
-        if not sentence_docs:
-            warnings.warn(Warnings.W093.format(name="Entity Linker"))
-            return losses
-        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
-        loss, d_scores = self.get_loss(
-            sentence_encodings=sentence_encodings, examples=examples
-        )
-        bp_context(d_scores)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss
-        return losses
-
-    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
-        validate_examples(examples, "EntityLinker_v1.get_loss")
-        entity_encodings = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    entity_encoding = self.kb.get_vector(kb_id)
-                    entity_encodings.append(entity_encoding)
-        entity_encodings = self.model.ops.asarray2f(entity_encodings)
-        if sentence_encodings.shape != entity_encodings.shape:
-            err = Errors.E147.format(
-                method="get_loss", msg="gold entities do not match up"
-            )
-            raise RuntimeError(err)
-        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
-        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
-        loss = loss / len(entity_encodings)
-        return float(loss), gradients
-
-    def predict(self, docs: Iterable[Doc]) -> List[str]:
-        """Apply the pipeline's model to a batch of docs, without modifying them.
-        Returns the KB IDs for each entity in each doc, including NIL if there is
-        no prediction.
-
-        docs (Iterable[Doc]): The documents to predict.
-        RETURNS (List[str]): The models prediction for each document.
-
-        DOCS: https://spacy.io/api/entitylinker#predict
-        """
-        self.validate_kb()
-        entity_count = 0
-        final_kb_ids: List[str] = []
-        if not docs:
-            return final_kb_ids
-        if isinstance(docs, Doc):
-            docs = [docs]
-        for i, doc in enumerate(docs):
-            sentences = [s for s in doc.sents]
-            if len(doc) > 0:
-                # Looping through each entity (TODO: rewrite)
-                for ent in doc.ents:
-                    sent = ent.sent
-                    sent_index = sentences.index(sent)
-                    assert sent_index >= 0
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    xp = self.model.ops.xp
-                    if self.incl_context:
-                        sentence_encoding = self.model.predict([sent_doc])[0]
-                        sentence_encoding_t = sentence_encoding.T
-                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                    entity_count += 1
-                    if ent.label_ in self.labels_discard:
-                        # ignoring this entity - setting to NIL
-                        final_kb_ids.append(self.NIL)
-                    else:
-                        candidates = list(self.get_candidates(self.kb, ent))
-                        if not candidates:
-                            # no prediction possible for this entity - setting to NIL
-                            final_kb_ids.append(self.NIL)
-                        elif len(candidates) == 1:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
-                                    )
-                                # cosine similarity
-                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                    sentence_norm * entity_norm
-                                )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            best_index = scores.argmax().item()
-                            best_candidate = candidates[best_index]
-                            final_kb_ids.append(best_candidate.entity_)
-        if not (len(final_kb_ids) == entity_count):
-            err = Errors.E147.format(
-                method="predict", msg="result variables not of equal length"
-            )
-            raise RuntimeError(err)
-        return final_kb_ids
-
-    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
-        """Modify a batch of documents, using pre-computed scores.
-
-        docs (Iterable[Doc]): The documents to modify.
-        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
-
-        DOCS: https://spacy.io/api/entitylinker#set_annotations
-        """
-        count_ents = len([ent for doc in docs for ent in doc.ents])
-        if count_ents != len(kb_ids):
-            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
-        i = 0
-        overwrite = self.cfg["overwrite"]
-        for doc in docs:
-            for ent in doc.ents:
-                kb_id = kb_ids[i]
-                i += 1
-                for token in ent:
-                    if token.ent_kb_id == 0 or overwrite:
-                        token.ent_kb_id_ = kb_id
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://spacy.io/api/entitylinker#to_bytes
-        """
-        self._validate_serialization_attrs()
-        serialize = {}
-        if hasattr(self, "cfg") and self.cfg is not None:
-            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
-        serialize["kb"] = self.kb.to_bytes
-        serialize["model"] = self.model.to_bytes
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (TrainablePipe): The loaded object.
-
-        DOCS: https://spacy.io/api/entitylinker#from_bytes
-        """
-        self._validate_serialization_attrs()
-
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        if hasattr(self, "cfg") and self.cfg is not None:
-            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
-        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
-        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
-        deserialize["model"] = load_model
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://spacy.io/api/entitylinker#to_disk
-        """
-        serialize = {}
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
-        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["kb"] = lambda p: self.kb.to_disk(p)
-        serialize["model"] = lambda p: self.model.to_disk(p)
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityLinker_v1":
-        """Load the pipe from disk. Modifies the object in place and returns it.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (EntityLinker): The modified EntityLinker object.
-
-        DOCS: https://spacy.io/api/entitylinker#from_disk
-        """
-
-        def load_model(p):
-            try:
-                with p.open("rb") as infile:
-                    self.model.from_bytes(infile.read())
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize: Dict[str, Callable[[Any], Any]] = {}
-        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
-        deserialize["kb"] = lambda p: self.kb.from_disk(p)
-        deserialize["model"] = load_model
-        util.from_disk(path, deserialize, exclude)
-        return self
-
-    def rehearse(self, examples, *, sgd=None, losses=None, **config):
-        raise NotImplementedError
-
-    def add_label(self, label):
-        raise NotImplementedError
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 80b6e766347..9e955f23e43 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -11,7 +11,6 @@
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.pipeline import EntityLinker, TrainablePipe
-from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tests.util import make_tempdir
@@ -987,6 +986,8 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
+    from spacy_legacy.components.entity_linker import EntityLinker_v1
+
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From a18b11b74d39d8174acc7f726163c1a1ee28723f Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 2 Feb 2023 22:13:38 +0900
Subject: [PATCH 163/504] Cleanup/remove backwards compat overwrite settings
 (#11888)

* Remove backwards-compatible overwrite from Entity Linker

This also adds a docstring about overwrite, since it wasn't present.

* Fix docstring

* Remove backward compat settings in Morphologizer

This also needed a docstring added.

For this component it's less clear what the right overwrite settings
are.

* Remove backward compat from sentencizer

This was simple

* Remove backward compat from senter

Another simple one

* Remove backward compat setting from tagger

* Add docstrings

* Update spacy/pipeline/morphologizer.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update docs

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/pipeline/entity_linker.py    | 11 +++--------
 spacy/pipeline/morphologizer.pyx   | 11 ++++-------
 spacy/pipeline/sentencizer.pyx     |  7 ++-----
 spacy/pipeline/senter.pyx          |  5 ++---
 spacy/pipeline/tagger.pyx          |  6 ++----
 website/docs/api/entitylinker.mdx  |  2 +-
 website/docs/api/morphologizer.mdx |  2 +-
 7 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index db4f0e105c1..21e3a279749 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -43,9 +43,6 @@
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
 
-# See #9050
-BACKWARD_OVERWRITE = True
-
 default_model_config = """
 [model]
 @architectures = "spacy.EntityLinker.v2"
@@ -76,8 +73,7 @@
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
-        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
-        "overwrite": True,
+        "overwrite": False,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "candidates_batch_size": 1,
@@ -211,8 +207,7 @@ def __init__(
         get_candidates_batch: Callable[
             [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
         ],
-        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
-        overwrite: bool = BACKWARD_OVERWRITE,
+        overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
         candidates_batch_size: int,
@@ -236,7 +231,7 @@ def __init__(
             Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
-        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
+        overwrite (bool): Whether to overwrite existing non-empty annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
             component must provide entity annotations.
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d3068bdffdd..5e7d0720a40 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -28,10 +28,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = True
-BACKWARD_EXTEND = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -113,9 +109,8 @@ class Morphologizer(Tagger):
         model: Model,
         name: str = "morphologizer",
         *,
-        overwrite: bool = BACKWARD_OVERWRITE,
-        extend: bool = BACKWARD_EXTEND,
-        label_smoothing: float = 0.0,
+        overwrite: bool = False,
+        extend: bool = False,
         scorer: Optional[Callable] = morphologizer_score,
         save_activations: bool = False,
     ):
@@ -125,6 +120,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
+        extend (bool): Whether to extend existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 08ba9d989c1..02b92e87812 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -10,10 +10,6 @@ from ..language import Language
 from .pipe import Pipe
 from .senter import senter_score
 
-# see #9050
-BACKWARD_OVERWRITE = False
-
-
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
@@ -55,13 +51,14 @@ class Sentencizer(Pipe):
         name="sentencizer",
         *,
         punct_chars=None,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=senter_score,
     ):
         """Initialize the sentencizer.
 
         punct_chars (list): Punctuation characters to split on. Will be
             serialized with the nlp object.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
 
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 185430c122c..ba45df28400 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -20,8 +20,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = False
 
 default_model_config = """
 [model]
@@ -85,7 +83,7 @@ class SentenceRecognizer(Tagger):
         model,
         name="senter",
         *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=senter_score,
         save_activations: bool = False,
     ):
@@ -95,6 +93,7 @@ class SentenceRecognizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
         save_activations (bool): save model activations in Doc when annotating.
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index a8a89332bd4..8740058174a 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -26,9 +26,6 @@ from .trainable_pipe import TrainablePipe
 
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
-# See #9050
-BACKWARD_OVERWRITE = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -98,7 +95,7 @@ class Tagger(TrainablePipe):
         model,
         name="tagger",
         *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=tagger_score,
         neg_prefix="!",
         save_activations: bool = False,
@@ -109,6 +106,7 @@ class Tagger(TrainablePipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
         save_activations (bool): save model activations in Doc when annotating.
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 238b62a2e6d..12b2f6bef1d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
 | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
 | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
 | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
 | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4660ec312fa..9514bc773b9 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |

From 91f0b277a517932ced3d09cc5179b5876b91fa54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 3 Feb 2023 15:22:25 +0100
Subject: [PATCH 164/504] `Language.update`: ensure that tok2vec gets updated
 (#12136)

* `Language.update`: ensure that tok2vec gets updated

The components in a pipeline can be updated independently. However,
tok2vec implementations are an exception to this, since they depend on
listeners for their gradients. The update method of a tok2vec
implementation computes the tok2vec forward and passes this along with a
backprop function to the listeners. This backprop function accumulates
gradients for all the listeners. There are two ways in which the
accumulated gradients can be used to update the tok2vec weights:

1. Call the `finish_update` method of tok2vec *after* the `update`
   method is called on all of the pipes that use a tok2vec listener.
2. Pass an optimizer to the `update` method of tok2vec. In this
   case, tok2vec will give the last listener a special backprop
   function that calls `finish_update` on the tok2vec.

Unfortunately, `Language.update` did neither of these. Instead, it
immediately called `finish_update` on every pipe after `update`. As a
result, the tok2vec weights are updated when no gradients have been
accumulated from listeners yet. And the gradients of the listeners are
only used in the next call to `Language.update` (when `finish_update` is
called on tok2vec again).

This change fixes this issue by passing the optimizer to the `update`
method of trainable pipes, leading to use of the second strategy
outlined above.

The main updating loop in `Language.update` is also simplified by using
the `TrainableComponent` protocol consistently.

* Train loop: `sgd` is `Optional[Optimizer]`, do not pass false

* Language.update: call pipe finish_update after all pipe updates

This does correct and fast updates if multiple components update the
same parameters.

* Add comment why we moved `finish_update` to a separate loop
---
 spacy/language.py                             | 28 ++++---
 .../pipeline/test_annotates_on_update.py      | 12 ++-
 spacy/tests/test_language.py                  | 73 ++++++++++++++++++-
 spacy/training/loop.py                        |  2 +-
 4 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index cb9652e97bf..51189ab371a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1280,17 +1280,12 @@ def update(
             component_cfg[name].setdefault("drop", drop)
             pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            # ignore statements are used here because mypy ignores hasattr
-            if name not in exclude and hasattr(proc, "update"):
-                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
-            if sgd not in (None, False):
-                if (
-                    name not in exclude
-                    and isinstance(proc, ty.TrainableComponent)
-                    and proc.is_trainable
-                    and proc.model not in (True, False, None)
-                ):
-                    proc.finish_update(sgd)
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
             if name in annotates:
                 for doc, eg in zip(
                     _pipe(
@@ -1303,6 +1298,17 @@ def update(
                     examples,
                 ):
                     eg.predicted = doc
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for name, proc in self.pipeline:
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.finish_update(sgd)
+
         return losses
 
     def rehearse(
diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py
index d4feebd3045..f13a0ae5a3c 100644
--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@@ -55,9 +55,11 @@ def assert_sents(nlp, name):
         return AssertSents(name)
 
     class AssertSents:
+        model = None
+        is_trainable = True
+
         def __init__(self, name, **cfg):
             self.name = name
-            pass
 
         def __call__(self, doc):
             if not doc.has_annotation("SENT_START"):
@@ -65,10 +67,16 @@ def __call__(self, doc):
             return doc
 
         def update(self, examples, *, drop=0.0, sgd=None, losses=None):
+            losses.setdefault(self.name, 0.0)
+
             for example in examples:
                 if not example.predicted.has_annotation("SENT_START"):
                     raise ValueError("No sents")
-            return {}
+
+            return losses
+
+        def finish_update(self, sgd=None):
+            pass
 
     nlp = English()
     nlp.add_pipe("sentencizer")
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index b419d77b51d..88ef3d434c0 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -13,8 +13,12 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.util import find_matching_language, ignore_error, raise_error, registry
-from spacy.vocab import Vocab
+from spacy.lang.en import English
+from spacy.lang.de import German
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
+from spacy.util import load_model_from_config
+import spacy
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
@@ -27,6 +31,51 @@
 except ImportError:
     pass
 
+TAGGER_CFG_STRING = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
+
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v2"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
+
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v2"
+
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v1"
+    width = ${components.tok2vec.model.encode.width}
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
+
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v2"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
+
+
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
 
 TAGGER_TRAIN_DATA = [
     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
@@ -93,6 +142,26 @@ def test_language_update(nlp):
         example = Example.from_dict(doc, wrongkeyannots)
 
 
+def test_language_update_updates():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=optimizer)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    assert xp.any(
+        xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+    )
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 58d5b06786f..e6b3451cd73 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -221,7 +221,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=False,  # type: ignore[arg-type]
+                sgd=None,
                 exclude=exclude,
                 annotates=annotating_components,
             )

From e185d338a898b592ef7b927826315fa2bb476ac1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:28:34 +0100
Subject: [PATCH 165/504] Use the same tuple in Span cmp and hash (#12251)

---
 spacy/tokens/span.pyx | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index bf37f955d98..7da47616489 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -133,9 +133,8 @@ cdef class Span:
             else:
                 return True
 
-        cdef SpanC* span_c = self.span_c()
-        cdef SpanC* other_span_c = other.span_c()
-
+        self_tuple = self._cmp_tuple()
+        other_tuple = other._cmp_tuple()
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
@@ -170,8 +169,20 @@ cdef class Span:
             return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
+        return hash(self._cmp_tuple())
+
+    def _cmp_tuple(self):
         cdef SpanC* span_c = self.span_c()
-        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
+        return (
+            span_c.start_char,
+            span_c.end_char,
+            span_c.start,
+            span_c.end,
+            span_c.label,
+            span_c.kb_id,
+            span_c.id,
+            self.doc,
+        )
 
     def __len__(self):
         """Get the number of tokens in the span.

From 927f8cc423d8a701fdd3e070a0d9d395f61f53b0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:37:42 +0100
Subject: [PATCH 166/504] Remove names for vectors (#12243)

* Remove names for vectors

Named vectors are basically a carry-over from v2 and aren't used for
anything.

* Format
---
 spacy/cli/init_pipeline.py                    |  2 --
 spacy/language.py                             | 14 +----------
 .../serialize/test_serialize_pipeline.py      |  2 +-
 spacy/tests/vocab_vectors/test_vectors.py     | 13 +++++-----
 spacy/training/initialize.py                  |  7 ------
 spacy/vectors.pyx                             |  5 +---
 spacy/vocab.pyi                               |  4 ++--
 spacy/vocab.pyx                               | 24 ++++++-------------
 website/docs/api/cli.mdx                      |  6 ++---
 website/docs/api/vectors.mdx                  |  1 -
 website/docs/api/vocab.mdx                    |  1 -
 11 files changed, 20 insertions(+), 59 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 0ff39d2145b..1a044dedbc9 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -21,7 +21,6 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
-    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
@@ -45,7 +44,6 @@ def init_vectors_cli(
         vectors_loc,
         truncate=truncate,
         prune=prune,
-        name=name,
         mode=mode,
         attr=attr,
     )
diff --git a/spacy/language.py b/spacy/language.py
index 51189ab371a..e8a7d719ef2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -194,12 +194,7 @@ def __init__(
         if not isinstance(vocab, Vocab) and vocab is not True:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
-            vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
-            if not create_vectors:
-                vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
-                create_vectors = registry.resolve(vectors_cfg)["vectors"]
-            vocab.vectors = create_vectors(vocab)
+            vocab = create_vocab(self.lang, self.Defaults)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -253,7 +248,6 @@ def meta(self) -> Dict[str, Any]:
             "width": self.vocab.vectors_length,
             "vectors": len(self.vocab.vectors),
             "keys": self.vocab.vectors.n_keys,
-            "name": self.vocab.vectors.name,
             "mode": self.vocab.vectors.mode,
         }
         self._meta["labels"] = dict(self.pipe_labels)
@@ -2275,9 +2269,6 @@ def deserialize_meta(path: Path) -> None:
             if path.exists():
                 data = srsly.read_json(path)
                 self.meta.update(data)
-                # self.meta always overrides meta["vectors"] with the metadata
-                # from self.vocab.vectors, so set the name directly
-                self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         def deserialize_vocab(path: Path) -> None:
             if path.exists():
@@ -2346,9 +2337,6 @@ def from_bytes(
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
-            # self.meta always overrides meta["vectors"] with the metadata
-            # from self.vocab.vectors, so set the name directly
-            self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         deserializers: Dict[str, Callable[[bytes], Any]] = {}
         deserializers["config.cfg"] = lambda b: self.config.from_bytes(
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 8170488f758..39fbbf58217 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -181,7 +181,7 @@ def test_issue4042_bug2():
 @pytest.mark.issue(4725)
 def test_issue4725_1():
     """Ensure the pickling of the NER goes well"""
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     nlp = English(vocab=vocab)
     config = {
         "update_with_oracle_cut_size": 111,
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 7172913141c..16574656bfb 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -84,7 +84,7 @@ def test_issue1539():
 @pytest.mark.issue(1807)
 def test_issue1807():
     """Test vocab.set_vector also adds the word to the vocab."""
-    vocab = Vocab(vectors_name="test_issue1807")
+    vocab = Vocab()
     assert "hello" not in vocab
     vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
     assert "hello" in vocab
@@ -94,13 +94,12 @@ def test_issue1807():
 def test_issue2871():
     """Test that vectors recover the correct key for spaCy reserved words."""
     words = ["dog", "cat", "SUFFIX"]
-    vocab = Vocab(vectors_name="test_issue2871")
+    vocab = Vocab()
     vocab.vectors.resize(shape=(3, 10))
     vector_data = numpy.zeros((3, 10), dtype="f")
     for word in words:
         _ = vocab[word]  # noqa: F841
         vocab.set_vector(word, vector_data[0])
-    vocab.vectors.name = "dummy_vectors"
     assert vocab["dog"].rank == 0
     assert vocab["cat"].rank == 1
     assert vocab["SUFFIX"].rank == 2
@@ -125,7 +124,7 @@ def test_issue4725_2():
         # ensures that this runs correctly and doesn't hang or crash because of the global vectors
         # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
         # or because of issues with pickling the NER (cf test_issue4725_1)
-        vocab = Vocab(vectors_name="test_vocab_add_vector")
+        vocab = Vocab()
         data = numpy.ndarray((5, 3), dtype="f")
         data[0] = 1.0
         data[1] = 2.0
@@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
 
 
 def test_vocab_add_vector():
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
@@ -356,7 +355,7 @@ def test_vocab_add_vector():
 
 
 def test_vocab_prune_vectors():
-    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
+    vocab = Vocab()
     _ = vocab["cat"]  # noqa: F841
     _ = vocab["dog"]  # noqa: F841
     _ = vocab["kitten"]  # noqa: F841
@@ -406,7 +405,7 @@ def test_vectors_serialize():
 
 
 def test_vector_is_oov():
-    vocab = Vocab(vectors_name="test_vocab_is_oov")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 0621702214c..191821e786e 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -214,7 +214,6 @@ def convert_vectors(
     *,
     truncate: int,
     prune: int,
-    name: Optional[str] = None,
     mode: str = VectorsMode.default,
     attr: str = "ORTH",
 ) -> None:
@@ -262,12 +261,6 @@ def convert_vectors(
                     attr=attr,
                 )
                 nlp.vocab.deduplicate_vectors()
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1 and mode != VectorsMode.floret:
         nlp.vocab.prune_vectors(prune)
 
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 6ff99bb59eb..e16efd2738d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -124,7 +124,6 @@ cdef class Vectors(BaseVectors):
     DOCS: https://spacy.io/api/vectors
     """
     cdef public object strings
-    cdef public object name
     cdef readonly object mode
     cdef public object data
     cdef public object key2row
@@ -137,14 +136,13 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
         """Create a new vector store.
 
         strings (StringStore): The string store.
         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray or cupy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
-        name (str): A name to identify the vectors table.
         mode (str): Vectors mode: "default" or "floret" (default: "default").
         minn (int): The floret char ngram minn (default: 0).
         maxn (int): The floret char ngram maxn (default: 0).
@@ -160,7 +158,6 @@ cdef class Vectors(BaseVectors):
         self.strings = strings
         if self.strings is None:
             self.strings = StringStore()
-        self.name = name
         if mode not in Mode.values():
             raise ValueError(
                 Errors.E202.format(
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 7f5f23e7847..7fbb9764f10 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -12,7 +12,8 @@ from .tokens import Doc, Span
 from .vectors import Vectors
 
 def create_vocab(
-    lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
+    lang: Optional[str],
+    defaults: Any,
 ) -> Vocab: ...
 
 class Vocab:
@@ -29,7 +30,6 @@ class Vocab:
         strings: Optional[Union[List[str], StringStore]] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
-        vectors_name: Optional[str] = ...,
         writing_system: Dict[str, Any] = ...,
         get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
     ) -> None: ...
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 8ac1215dead..3145f51844a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -20,7 +20,7 @@ from .vectors import Mode as VectorsMode
 from .vectors import Vectors
 
 
-def create_vocab(lang, defaults, vectors_name=None):
+def create_vocab(lang, defaults):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@@ -36,7 +36,6 @@ def create_vocab(lang, defaults, vectors_name=None):
         lex_attr_getters=lex_attrs,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
-        vectors_name=vectors_name,
     )
 
 
@@ -47,17 +46,9 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(
-        self,
-        lex_attr_getters=None,
-        strings=tuple(),
-        lookups=None,
-        oov_prob=-20.,
-        vectors_name=None,
-        writing_system={},  # no-cython-lint
-        get_noun_chunks=None,
-        **deprecated_kwargs
-    ):
+    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
+                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
+                 **deprecated_kwargs):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -66,7 +57,6 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """
@@ -83,7 +73,7 @@ cdef class Vocab:
                 _ = self[string]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
-        self.vectors = Vectors(strings=self.strings, name=vectors_name)
+        self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
@@ -320,7 +310,7 @@ cdef class Vocab:
             for key, row in self.vectors.key2row.items()
         }
         # replace vectors with deduplicated version
-        self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=data)
         for key, row in key2row.items():
             self.vectors.add(key, row=row)
 
@@ -377,7 +367,7 @@ cdef class Vocab:
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
-        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
         syn_keys = ops.to_numpy(syn_keys)
         remap = {}
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index acc2ce1caa2..3f91e1ff71e 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -225,7 +225,7 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>
 
 ```bash
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
 ```
 
 | Name               | Description                                                                                                                                                                                                                                                         |
@@ -235,9 +235,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
 | `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
-| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str \(option)~~                                                                                                                                                |
-| `--attr`, `-a`     | Token attribute to use for vectors, e.g. `LOWER` or `NORM`) Defaults to `ORTH`. ~~str \(option)~~                                                                                                                                                                   |
-| `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
+| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~                                                                                                                                      |
 | `--verbose`, `-V`  | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      |
 | `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
 | **CREATES**        | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |
diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx
index 0e92eb12ba4..39b309e1377 100644
--- a/website/docs/api/vectors.mdx
+++ b/website/docs/api/vectors.mdx
@@ -52,7 +52,6 @@ modified later.
 | `shape`                                   | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
 | `data`                                    | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~                                                                                                                              |
 | `keys`                                    | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~                                                                                                                |
-| `name`                                    | A name to identify the vectors table. ~~str~~                                                                                                                                          |
 | `mode` <Tag variant="new">3.2</Tag>       | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~                                                                         |
 | `minn` <Tag variant="new">3.2</Tag>       | The floret char ngram minn (default: `0`). ~~int~~                                                                                                                                     |
 | `maxn` <Tag variant="new">3.2</Tag>       | The floret char ngram maxn (default: `0`). ~~int~~                                                                                                                                     |
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 57618397da5..36369c78427 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -34,7 +34,6 @@ Create the vocabulary.
 | `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           |
 | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      |
 | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             |
-| `vectors_name`     | A name to identify the vectors table. ~~str~~                                                                                                                           |
 | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          |
 | `get_noun_chunks`  | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
 

From 37a57325b660e26ab41d45e0dbb31fa550ccf76e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:46:07 +0100
Subject: [PATCH 167/504] Remove unused Span.char_span(id=) (#12250)

---
 spacy/tokens/span.pyi     | 1 -
 spacy/tokens/span.pyx     | 3 +--
 website/docs/api/span.mdx | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index ae4a6209e7e..373b4ed1afe 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -103,7 +103,6 @@ class Span:
         label: Union[int, str] = ...,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
-        id: Union[int, str] = ...,
         alignment_mode: str = ...,
         span_id: Union[int, str] = ...,
     ) -> Span: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7da47616489..3f8630c638e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -694,7 +694,7 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
         start (int): The index of the first character of the span.
@@ -704,7 +704,6 @@ cdef class Span:
         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
-        id (Union[int, str]): Unused.
         alignment_mode (str): How character indices are aligned to token
             boundaries. Options: "strict" (character indices must be aligned
             with token boundaries), "contract" (span of all tokens completely
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 1774a298ff2..fa5791c405e 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -193,7 +193,6 @@ the character indices don't map to a valid span.
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| `id`                                            | Unused. ~~Union[int, str]~~                                                                                                                                                                                                                                                  |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | `span_id` <Tag variant="new">3.5.1</Tag>        | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
 | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

From 80777aed6b02b7e9d56cbc9e199c705bc0cc0256 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 15 Feb 2023 12:34:33 +0100
Subject: [PATCH 168/504] Make Span.char_span optional args keyword-only
 (#12257)

* Make Span.char_span optional args keyword-only

* Make kb_id and following kw-only

* Format
---
 spacy/tokens/doc.pyi      | 3 ++-
 spacy/tokens/doc.pyx      | 4 ++--
 spacy/tokens/span.pyi     | 1 +
 spacy/tokens/span.pyx     | 6 +++---
 website/docs/api/doc.mdx  | 1 +
 website/docs/api/span.mdx | 5 +++--
 6 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 97c3f69f430..11f8a1c5eb8 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -124,6 +124,7 @@ class Doc:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
@@ -151,7 +152,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 56ee216d17f..79bb965bb3c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -556,7 +556,7 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice
         `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
         created.
@@ -1647,7 +1647,7 @@ cdef class Doc:
         for span_group in doc_json.get("spans", {}):
             spans = []
             for span in doc_json["spans"][span_group]:
-                char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"])
+                char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
                 if char_span is None:
                     raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
                 spans.append(char_span)
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 373b4ed1afe..3c85542bb3d 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -101,6 +101,7 @@ class Span:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 3f8630c638e..883a67f3dd6 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -694,11 +694,11 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
         label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 28757cbc45f..f53e209afc8 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -214,6 +214,7 @@ alignment mode `"strict".
 | `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
 | `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| _keyword-only_                           |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index fa5791c405e..ae7ef7203b6 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -188,9 +188,10 @@ the character indices don't map to a valid span.
 
 | Name                                            | Description                                                                                                                                                                                                                                                                  |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `start_idx`                                     | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end_idx`                                       | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| _keyword-only_                                  |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |

From a7fd008b0669bc14c15cd2e0a8ab1427d6b69839 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 16 Feb 2023 19:08:55 +0900
Subject: [PATCH 169/504] Use tempfile.TemporaryDirectory (#12285)

---
 spacy/util.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 8c402a74ce9..7448da8ded0 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1056,22 +1056,11 @@ def make_tempdir() -> Generator[Path, None, None]:
     its contents at the end of the with block.
     YIELDS (Path): The path of the temp directory.
     """
-    d = Path(tempfile.mkdtemp())
-    yield d
-
-    # On Windows, git clones use read-only files, which cause permission errors
-    # when being deleted. This forcibly fixes permissions.
-    def force_remove(rmfunc, path, ex):
-        os.chmod(path, stat.S_IWRITE)
-        rmfunc(path)
-
     try:
-        if sys.version_info >= (3, 12):
-            shutil.rmtree(str(d), onexc=force_remove)
-        else:
-            shutil.rmtree(str(d), onerror=force_remove)
+        with tempfile.TemporaryDirectory() as td:
+            yield Path(td)
     except PermissionError as e:
-        warnings.warn(Warnings.W091.format(dir=d, msg=e))
+        warnings.warn(Warnings.W091.format(dir=td, msg=e))
 
 
 def is_in_jupyter() -> bool:

From 4ad7d0850fad74b0ae51d43584f0ebfc8ffc7469 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 21 Feb 2023 15:47:18 +0100
Subject: [PATCH 170/504] Reimplement distillation with oracle cut size
 (#12214)

* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 101 +++++++++++++++++++++------
 spacy/tests/parser/test_model.py     |  61 ++++++++++++++++
 spacy/tests/parser/test_ner.py       |   5 +-
 spacy/tests/parser/test_parse.py     |   5 +-
 5 files changed, 152 insertions(+), 24 deletions(-)
 create mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00bd..9b2114900d3 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 6a50dbacaeb..ef2e3314e85 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -45,6 +45,11 @@ from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
 from ._parser_internals import _beam_utils
 
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
+
 NUMPY_OPS = NumpyOps()
 
 
@@ -262,8 +267,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -274,12 +279,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs,
+            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
+            states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -528,7 +533,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_to_actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -648,7 +653,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch(self, teacher_step_model, docs, max_length):
+    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -657,10 +662,12 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
-            StateClass start_state
             StateClass state
-            Transition action
-        all_states = self.moves.init_batch(docs)
+            TransitionSystem moves = teacher_pipe.moves
+
+        # Start with the same heuristic as in supervised training: exclude
+        # docs that are within the maximum length.
+        all_states = moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -669,18 +676,28 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
+
+        if not to_cut:
+            return states
+
+        # Parse the states that are too long with the teacher's parsing model.
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+            states=[state.copy() for state in to_cut])
+        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+
+        # Step through the teacher's actions and store every state after
+        # each multiple of max_length.
+        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            # Move states forward max_length actions.
-            length = 0
-            while to_cut and length < max_length:
-                teacher_scores = teacher_step_model.predict(to_cut)
-                self.transition_states(to_cut, teacher_scores)
-                # States that are completed do not need further cutting.
-                to_cut = [state for state in to_cut if not state.is_final()]
-                length += 1
-        return states
+            for step_actions in teacher_actions[:max_length]:
+                to_cut = moves.apply_actions(to_cut, step_actions)
+            teacher_actions = teacher_actions[max_length:]
+
+            if len(teacher_actions) < max_length:
+                break
 
+        return states
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
@@ -742,7 +759,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
+def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -763,3 +780,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
+
+def _states_diff_to_actions(
+    before_states: List[StateClass],
+    after_states: List[StateClass]
+) -> List[Ints1d]:
+    """
+    Return for two sets of states the actions to go from the first set of
+    states to the second set of states. The histories of the first set of
+    states must be a prefix of the second set of states.
+    """
+    cdef StateClass before_state, after_state
+    cdef StateC* c_state_before
+    cdef StateC* c_state_after
+
+    assert len(before_states) == len(after_states)
+
+    # Check invariant: before states histories must be prefixes of after states.
+    for before_state, after_state in zip(before_states, after_states):
+        c_state_before = before_state.c
+        c_state_after = after_state.c
+
+        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+            c_state_after.history.begin())
+
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for before_state, after_state in zip(before_states, after_states):
+            c_state_before = before_state.c
+            c_state_after = after_state.c
+            if step < c_state_after.history.size() - c_state_before.history.size():
+                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 00000000000..8c1cf7a9346
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b2c39ae88bc..2c520b7daf6 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a6e1852514d..4c709932bb1 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -460,7 +460,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -478,6 +480,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 463eefc47d5a69a666691d110ff3afe808d2f9d2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 23 Feb 2023 11:36:50 +0100
Subject: [PATCH 171/504] Remove backoff from .vector to .tensor (#12292)

---
 spacy/tokens/doc.pyx                           |  3 ---
 spacy/tokens/span.pyx                          |  2 --
 spacy/tokens/token.pyx                         |  6 +-----
 website/docs/usage/101/_vectors-similarity.mdx | 15 +++++++++------
 4 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 79bb965bb3c..d44e83182f8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -702,9 +702,6 @@ cdef class Doc:
             elif self.vocab.vectors.size > 0:
                 self._vector = sum(t.vector for t in self) / len(self)
                 return self._vector
-            elif self.tensor.size > 0:
-                self._vector = self.tensor.mean(axis=0)
-                return self._vector
             else:
                 return xp.zeros((self.vocab.vectors_length,), dtype="float32")
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 883a67f3dd6..c439c8655dc 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -561,8 +561,6 @@ cdef class Span:
             return self.doc.user_span_hooks["has_vector"](self)
         elif self.vocab.vectors.size > 0:
             return any(token.has_vector for token in self)
-        elif self.doc.tensor.size > 0:
-            return True
         else:
             return False
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 3a7ce45c54a..6c4806ff9cb 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -415,9 +415,7 @@ cdef class Token:
         """
         if "has_vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["has_vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return True
-        return self.vocab.has_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
+        return self.vocab.has_vector(self.c.lex.orth)
 
     @property
     def vector(self):
@@ -430,8 +428,6 @@ cdef class Token:
         """
         if "vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return self.doc.tensor[self.i]
         else:
             return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
 
diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx
index 6deab926d25..39ee8e48a43 100644
--- a/website/docs/usage/101/_vectors-similarity.mdx
+++ b/website/docs/usage/101/_vectors-similarity.mdx
@@ -22,17 +22,20 @@ array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
 <Infobox title="Important note" variant="warning">
 
 To make them compact and fast, spaCy's small [pipeline packages](/models) (all
-packages that end in `sm`) **don't ship with word vectors**, and only include
-context-sensitive **tensors**. This means you can still use the `similarity()`
-methods to compare documents, spans and tokens – but the result won't be as
-good, and individual tokens won't have any vectors assigned. So in order to use
-_real_ word vectors, you need to download a larger pipeline package:
+packages that end in `sm`) **don't ship with word vectors**. In order to use
+`similarity()`, you need to download a larger pipeline package that includes
+vectors:
 
 ```diff
 - python -m spacy download en_core_web_sm
-+ python -m spacy download en_core_web_lg
++ python -m spacy download en_core_web_md
 ```
 
+In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
+backing off to context-sensitive tensors from the `tok2vec` component. These
+tensors do not work well for this purpose and this backoff has been removed in
+spaCy v4.
+
 </Infobox>
 
 Pipeline packages that come with built-in word vectors make them available as

From 4a3d1e9f0c467376a1b9509c4fb857cd896cfe7d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 1 Mar 2023 16:00:02 +0100
Subject: [PATCH 172/504] Return Tuple[Span] for all Doc/Span attrs that
 provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
---
 spacy/tokens/doc.pyi      |  4 ++--
 spacy/tokens/doc.pyx      | 23 +++++++++++------------
 spacy/tokens/span.pyi     |  4 +++-
 spacy/tokens/span.pyx     | 28 ++++++++++++++++------------
 website/docs/api/doc.mdx  | 23 +++++++++++------------
 website/docs/api/span.mdx | 33 ++++++++++++++++-----------------
 6 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 11f8a1c5eb8..2b39d5baa28 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -155,9 +155,9 @@ class Doc:
         default: str = ...,
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
-    def sents(self) -> Iterator[Span]: ...
+    def sents(self) -> Tuple[Span]: ...
     @property
     def lang(self) -> int: ...
     @property
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d44e83182f8..893ba9c2cda 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -748,10 +748,10 @@ cdef class Doc:
         return self.text
 
     property ents:
-        """The named entities in the document. Returns a tuple of named entity
+        """The named entities in the document. Returns a list of named entity
         `Span` objects, if the entity recognizer has been applied.
 
-        RETURNS (tuple): Entities in the document, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
 
         DOCS: https://spacy.io/api/doc#ents
         """
@@ -909,7 +909,7 @@ cdef class Doc:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
 
-        YIELDS (Span): Noun chunks in the document.
+        RETURNS (Tuple[Span]): Noun chunks in the document.
 
         DOCS: https://spacy.io/api/doc#noun_chunks
         """
@@ -918,36 +918,35 @@ cdef class Doc:
 
         # Accumulate the result before beginning to iterate over it. This
         # prevents the tokenization from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenization changing, so it's okay once we have the Span
-        # objects. See Issue #375.
+        # during the iteration.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+        return tuple(spans)
 
     @property
     def sents(self):
         """Iterate over the sentences in the document. Yields sentence `Span`
         objects. Sentence spans have no label.
 
-        YIELDS (Span): Sentences in the document.
+        RETURNS (Tuple[Span]): Sentences in the document.
 
         DOCS: https://spacy.io/api/doc#sents
         """
         if not self.has_annotation("SENT_START"):
             raise ValueError(Errors.E030)
         if "sents" in self.user_hooks:
-            yield from self.user_hooks["sents"](self)
+            return tuple(self.user_hooks["sents"](self))
         else:
             start = 0
+            spans = []
             for i in range(1, self.length):
                 if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
+                    spans.append(Span(self, start, i))
                     start = i
             if start != self.length:
-                yield Span(self, start, self.length)
+                spans.append(Span(self, start, self.length))
+            return tuple(spans)
 
     @property
     def lang(self):
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 3c85542bb3d..2a529593e5f 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -81,6 +81,8 @@ class Span:
     @property
     def ents(self) -> Tuple[Span]: ...
     @property
+    def sents(self) -> Tuple[Span]: ...
+    @property
     def has_vector(self) -> bool: ...
     @property
     def vector(self) -> Floats1d: ...
@@ -93,7 +95,7 @@ class Span:
     @property
     def text_with_ws(self) -> str: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
     def root(self) -> Token: ...
     def char_span(
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c439c8655dc..1378889c681 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -487,20 +487,21 @@ cdef class Span:
         """Obtain the sentences that contain this span. If the given span
         crosses sentence boundaries, return all sentences it is a part of.
 
-        RETURNS (Iterable[Span]): All sentences that the span is a part of.
+        RETURNS (Tuple[Span]): All sentences that the span is a part of.
 
-         DOCS: https://spacy.io/api/span#sents
+        DOCS: https://spacy.io/api/span#sents
         """
         cdef int start
         cdef int i
 
         if "sents" in self.doc.user_span_hooks:
-            yield from self.doc.user_span_hooks["sents"](self)
-        elif "sents" in self.doc.user_hooks:
+            return tuple(self.doc.user_span_hooks["sents"](self))
+        spans = []
+        if "sents" in self.doc.user_hooks:
             for sentence in self.doc.user_hooks["sents"](self.doc):
                 if sentence.end > self.start:
                     if sentence.start < self.end or sentence.start == self.start == self.end:
-                        yield sentence
+                        spans.append(sentence)
                     else:
                         break
         else:
@@ -515,12 +516,13 @@ cdef class Span:
             # Now, find all the sentences in the span
             for i in range(start + 1, self.doc.length):
                 if self.doc.c[i].sent_start == 1:
-                    yield Span(self.doc, start, i)
+                    spans.append(Span(self.doc, start, i))
                     start = i
                     if start >= self.end:
                         break
-                elif i == self.doc.length - 1:
-                    yield Span(self.doc, start, self.doc.length)
+            if start < self.end:
+                spans.append(Span(self.doc, start, self.end))
+        return tuple(spans)
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
@@ -531,7 +533,7 @@ cdef class Span:
         """The named entities that fall completely within the span. Returns
         a tuple of `Span` objects.
 
-        RETURNS (tuple): Entities in the span, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
 
         DOCS: https://spacy.io/api/span#ents
         """
@@ -546,7 +548,7 @@ cdef class Span:
                     ents.append(ent)
                 else:
                     break
-        return ents
+        return tuple(ents)
 
     @property
     def has_vector(self):
@@ -641,13 +643,15 @@ cdef class Span:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
 
-        YIELDS (Span): Noun chunks in the span.
+        RETURNS (Tuple[Span]): Noun chunks in the span.
 
         DOCS: https://spacy.io/api/span#noun_chunks
         """
+        spans = []
         for span in self.doc.noun_chunks:
             if span.start >= self.start and span.end <= self.end:
-                yield span
+                spans.append(span)
+        return tuple(spans)
 
     @property
     def root(self):
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index f53e209afc8..e92c0e833e0 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
 
 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the document. Yields base noun-phrase
-`Span` objects, if the document has been syntactically parsed. A base noun
-phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
-nested within it – so no NP-level coordination, no prepositional phrases, and no
-relative clauses.
+Returns a tuple of the base noun phrases in the doc, if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
 > assert chunks[1].text == "another phrase"
 > ```
 
-| Name       | Description                           |
-| ---------- | ------------------------------------- |
-| **YIELDS** | Noun chunks in the document. ~~Span~~ |
+| Name        | Description                                  |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
 
 ## Doc.sents {id="sents",tag="property",model="sentences"}
 
-Iterate over the sentences in the document. Sentence spans have no label.
+Returns a tuple of the sentences in the document. Sentence spans have no label.
 
 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -697,9 +696,9 @@ will raise an error otherwise.
 > assert [s.root.text for s in sents] == ["is", "'s"]
 > ```
 
-| Name       | Description                         |
-| ---------- | ----------------------------------- |
-| **YIELDS** | Sentences in the document. ~~Span~~ |
+| Name        | Description                                |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
 
 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
 
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index ae7ef7203b6..cd70d8dcead 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
 > assert ents[0].text == "Mr. Best"
 > ```
 
-| Name        | Description                                                       |
-| ----------- | ----------------------------------------------------------------- |
-| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
+| Name        | Description                                                  |
+| ----------- | ------------------------------------------------------------ |
+| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
 
 ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
-objects, if the document has been syntactically parsed. A base noun phrase, or
-"NP chunk", is a noun phrase that does not permit other NPs to be nested within
-it – so no NP-level coordination, no prepositional phrases, and no relative
-clauses.
+Returns a tuple of the base noun phrases in the span if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
 has not been implemeted for the given language, a `NotImplementedError` is
@@ -301,9 +300,9 @@ raised.
 > assert chunks[0].text == "another phrase"
 > ```
 
-| Name       | Description                       |
-| ---------- | --------------------------------- |
-| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+| Name        | Description                              |
+| ----------- | ---------------------------------------- |
+| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
 
 ## Span.as_doc {id="as_doc",tag="method"}
 
@@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
 
 ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
 
-Returns a generator over the sentences the span belongs to. This property is
-only available when [sentence boundaries](/usage/linguistic-features#sbd) have
-been set on the document by the `parser`, `senter`, `sentencizer` or some custom
+Returns a tuple of the sentences the span belongs to. This property is only
+available when [sentence boundaries](/usage/linguistic-features#sbd) have been
+set on the document by the `parser`, `senter`, `sentencizer` or some custom
 function. It will raise an error otherwise.
 
 If the span happens to cross sentence boundaries, all sentences the span
@@ -541,9 +540,9 @@ overlaps with will be returned.
 > assert len(span.sents) == 2
 > ```
 
-| Name        | Description                                                                |
-| ----------- | -------------------------------------------------------------------------- |
-| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
+| Name        | Description                                                   |
+| ----------- | ------------------------------------------------------------- |
+| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
 
 ## Attributes {id="attributes"}
 

From 6450b3234ab0597040fc41a53ca4e626f96bcd56 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 173/504] Drop support for EntityLinker_v1. (#12377)

---
 spacy/errors.py                            |  1 +
 spacy/pipeline/entity_linker.py            | 23 ++--------------------
 spacy/tests/pipeline/test_entity_linker.py |  7 +------
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index c8c595395b3..83a1e9ba2c0 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -980,6 +980,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 21e3a279749..546bd9f6e2a 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -132,28 +132,9 @@ def make_entity_linker(
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
     """
-
     if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+        raise ValueError(Errors.E4005)
+
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 9e955f23e43..f28a4c9d5b9 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -980,13 +980,11 @@ def test_scorer_links():
 @pytest.mark.parametrize(
     "name,config",
     [
-        ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
     ],
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
 
     # Ensure that the legacy architectures still work
     vector_length = 3
@@ -1009,10 +1007,7 @@ def create_kb(vocab):
         return mykb
 
     entity_linker = nlp.add_pipe(name, config={"model": config})
-    if config["@architectures"] == "spacy.EntityLinker.v1":
-        assert isinstance(entity_linker, EntityLinker_v1)
-    else:
-        assert isinstance(entity_linker, EntityLinker)
+    assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
 

From 4e98de345d2b74651f502af8fcf4e02dd4cddb3f Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 9 Mar 2023 09:37:19 +0100
Subject: [PATCH 174/504] `Tok2Vec`: Add `distill` method (#12108)

* `Tok2Vec`: Add `distill` method

* `Tok2Vec`: Refactor `update`

* Add `Tok2Vec.distill` test

* Update `distill` signature to accept `Example`s instead of separate teacher and student docs

* Add docs

* Remove docstring

* Update test

* Remove `update` calls from test

* Update `Tok2Vec.distill` docstring
---
 spacy/pipeline/tok2vec.py            | 125 ++++++++++++++++++++-------
 spacy/tests/pipeline/test_tok2vec.py | 117 +++++++++++++++----------
 website/docs/api/tok2vec.mdx         |  37 ++++++++
 3 files changed, 204 insertions(+), 75 deletions(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 677f5eec16c..f168aee2ec4 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,3 +1,6 @@
+from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
+from thinc.api import Model, set_dropout_rate, Optimizer, Config
+from thinc.types import Floats2d
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
 
@@ -158,39 +161,9 @@ def update(
 
         DOCS: https://spacy.io/api/tok2vec#update
         """
-        if losses is None:
-            losses = {}
         validate_examples(examples, "Tok2Vec.update")
         docs = [eg.predicted for eg in examples]
-        set_dropout_rate(self.model, drop)
-        tokvecs, bp_tokvecs = self.model.begin_update(docs)
-        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-        losses.setdefault(self.name, 0.0)
-
-        def accumulate_gradient(one_d_tokvecs):
-            """Accumulate tok2vec loss and gradient. This is passed as a callback
-            to all but the last listener. Only the last one does the backprop.
-            """
-            nonlocal d_tokvecs
-            for i in range(len(one_d_tokvecs)):
-                d_tokvecs[i] += one_d_tokvecs[i]
-                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
-            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-
-        def backprop(one_d_tokvecs):
-            """Callback to actually do the backprop. Passed to last listener."""
-            accumulate_gradient(one_d_tokvecs)
-            d_docs = bp_tokvecs(d_tokvecs)
-            if sgd is not None:
-                self.finish_update(sgd)
-            return d_docs
-
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners[:-1]:
-            listener.receive(batch_id, tokvecs, accumulate_gradient)
-        if self.listeners:
-            self.listeners[-1].receive(batch_id, tokvecs, backprop)
-        return losses
+        return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
 
     def get_loss(self, examples, scores) -> None:
         pass
@@ -220,6 +193,96 @@ def initialize(
     def add_label(self, label):
         raise NotImplementedError
 
+    def distill(
+        self,
+        teacher_pipe: Optional["TrainablePipe"],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ) -> Dict[str, float]:
+        """Performs an update of the student pipe's model using the
+        student's distillation examples and sets the annotations
+        of the teacher's distillation examples using the teacher pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
+            for prediction.
+        examples (Iterable[Example]): Distillation examples. The reference (teacher)
+            and predicted (student) docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/tok2vec#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        teacher_docs = [eg.reference for eg in examples]
+        student_docs = [eg.predicted for eg in examples]
+        teacher_preds = teacher_pipe.predict(teacher_docs)
+        teacher_pipe.set_annotations(teacher_docs, teacher_preds)
+        return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
+
+    def _update_with_docs(
+        self,
+        docs: Iterable[Doc],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        set_dropout_rate(self.model, drop)
+
+        tokvecs, accumulate_gradient, backprop = self._create_backprops(
+            docs, losses, sgd=sgd
+        )
+        batch_id = Tok2VecListener.get_batch_id(docs)
+        for listener in self.listeners[:-1]:
+            listener.receive(batch_id, tokvecs, accumulate_gradient)
+        if self.listeners:
+            self.listeners[-1].receive(batch_id, tokvecs, backprop)
+        return losses
+
+    def _create_backprops(
+        self,
+        docs: Iterable[Doc],
+        losses: Dict[str, float],
+        *,
+        sgd: Optional[Optimizer] = None,
+    ) -> Tuple[Floats2d, Callable, Callable]:
+        tokvecs, bp_tokvecs = self.model.begin_update(docs)
+        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def accumulate_gradient(one_d_tokvecs):
+            """Accumulate tok2vec loss and gradient. This is passed as a callback
+            to all but the last listener. Only the last one does the backprop.
+            """
+            nonlocal d_tokvecs
+            for i in range(len(one_d_tokvecs)):
+                d_tokvecs[i] += one_d_tokvecs[i]
+                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
+            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def backprop(one_d_tokvecs):
+            """Callback to actually do the backprop. Passed to last listener."""
+            accumulate_gradient(one_d_tokvecs)
+            d_docs = bp_tokvecs(d_tokvecs)
+            if sgd is not None:
+                self.finish_update(sgd)
+            return d_docs
+
+        return tokvecs, accumulate_gradient, backprop
+
 
 class Tok2VecListener(Model):
     """A layer that gets fed its answers from an upstream connection,
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 9648341a106..e557e294112 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -561,55 +561,84 @@ def test_tok2vec_listeners_textcat():
     assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
 
 
-def test_tok2vec_listener_source_link_name():
-    """The component's internal name and the tok2vec listener map correspond
-    to the most recently modified pipeline.
-    """
-    orig_config = Config().from_str(cfg_string_multi)
-    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+cfg_string_distillation = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
 
-    nlp2 = English()
-    nlp2.add_pipe("tok2vec", source=nlp1)
-    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v2"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
 
-    # there is no way to have the component have the right name for both
-    # pipelines, right now the most recently modified pipeline is prioritized
-    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v2"
 
-    # there is no way to have the tok2vec have the right listener map for both
-    # pipelines, right now the most recently modified pipeline is prioritized
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
-    nlp2.add_pipe("ner", name="ner3", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
-    nlp2.remove_pipe("ner3")
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
-    nlp2.remove_pipe("tagger2")
-    assert nlp2.get_pipe("tok2vec").listening_components == []
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v2"
+    width = ${components.tok2vec.model.encode.width}
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
 
-    # at this point the tok2vec component corresponds to nlp2
-    assert nlp1.get_pipe("tok2vec").listening_components == []
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v2"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
 
-    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
-    nlp1.add_pipe("sentencizer")
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
 
-    # modifying nlp2 syncs it back to nlp2
-    nlp2.add_pipe("sentencizer")
-    assert nlp1.get_pipe("tok2vec").listening_components == []
+def test_tok2vec_distillation_teacher_annotations():
+    orig_config = Config().from_str(cfg_string_distillation)
+    teacher_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
+    student_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
 
+    train_examples_teacher = []
+    train_examples_student = []
+    for t in TRAIN_DATA:
+        train_examples_teacher.append(
+            Example.from_dict(teacher_nlp.make_doc(t[0]), t[1])
+        )
+        train_examples_student.append(
+            Example.from_dict(student_nlp.make_doc(t[0]), t[1])
+        )
 
-def test_tok2vec_listener_source_replace_listeners():
-    orig_config = Config().from_str(cfg_string_multi)
-    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
-    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
-    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
-
-    nlp2 = English()
-    nlp2.add_pipe("tok2vec", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == []
-    nlp2.add_pipe("tagger", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == []
-    nlp2.add_pipe("ner", name="ner2", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
+    optimizer = teacher_nlp.initialize(lambda: train_examples_teacher)
+    student_nlp.initialize(lambda: train_examples_student)
+
+    # Since Language.distill creates a copy of the examples to use as
+    # its internal teacher/student docs, we'll need to monkey-patch the
+    # tok2vec pipe's distill method.
+    student_tok2vec = student_nlp.get_pipe("tok2vec")
+    student_tok2vec._old_distill = student_tok2vec.distill
+
+    def tok2vec_distill_wrapper(
+        self,
+        teacher_pipe,
+        examples,
+        **kwargs,
+    ):
+        assert all(not eg.reference.tensor.any() for eg in examples)
+        out = self._old_distill(teacher_pipe, examples, **kwargs)
+        assert all(eg.reference.tensor.any() for eg in examples)
+        return out
+
+    student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
+    student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index a1bb1265eae..8b6d2380bae 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
+
+Performs an update of the student pipe's model using the student's distillation 
+examples and sets the annotations of the teacher's distillation examples using 
+the teacher pipe. 
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its 
+representations from the teacher. However, since downstream pipes that do 
+perform distillation expect the tok2vec annotations to be present on the 
+correct distillation examples, we need to ensure that they are set beforehand.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tok2vec")
+> student_pipe = student.add_pipe("tok2vec")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Tok2Vec.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood

From 3401b0150a26858ca9b0ada128c926d1adb53218 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 19 Mar 2023 23:41:20 +0100
Subject: [PATCH 175/504] Clean up Vocab constructor (#12290)

* Clean up Vocab constructor

* Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]`
  * Don't automatically add strings to vocab
* Change default values to `None`
* Remove `**deprecated_kwargs`

* Format
---
 spacy/strings.pyi                             |  2 +-
 spacy/tests/pipeline/test_pipe_methods.py     |  3 ++-
 .../serialize/test_serialize_vocab_strings.py | 27 +++++++++++--------
 spacy/tests/vocab_vectors/test_lexeme.py      |  2 +-
 spacy/vocab.pyi                               |  2 +-
 spacy/vocab.pyx                               | 18 +++++++------
 website/docs/api/vocab.mdx                    |  5 ++--
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 8b7c0d6bd5a..393661f591d 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
 class StringStore:
-    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
     @overload
     def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 9b9786f0458..39611a74278 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -9,6 +9,7 @@
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.strings import StringStore
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
@@ -131,7 +132,7 @@ def test_issue5458():
     # Test that the noun chuncker does not generate overlapping spans
     # fmt: off
     words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
-    vocab = Vocab(strings=words)
+    vocab = Vocab(strings=StringStore(words))
     deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
     pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
     heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index fd80c3d8e4f..f6356ac9e01 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -13,8 +13,11 @@
 
 from ..util import make_tempdir
 
-test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
-test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
+test_strings = [
+    (StringStore(), StringStore()),
+    (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])),
+]
+test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")]
 
 
 @pytest.mark.issue(599)
@@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     vocab2 = Vocab(strings=strings2)
     vocab1_b = vocab1.to_bytes()
     vocab2_b = vocab2.to_bytes()
-    if strings1 == strings2:
+    if strings1.to_bytes() == strings2.to_bytes():
         assert vocab1_b == vocab2_b
     else:
         assert vocab1_b != vocab2_b
@@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
 def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
-    vocab1[strings[0]].norm_ = lex_attr
-    assert vocab1[strings[0]].norm_ == lex_attr
-    assert vocab2[strings[0]].norm_ != lex_attr
+    s = next(iter(vocab1.strings))
+    vocab1[s].norm_ = lex_attr
+    assert vocab1[s].norm_ == lex_attr
+    assert vocab2[s].norm_ != lex_attr
     vocab2 = vocab2.from_bytes(vocab1.to_bytes())
-    assert vocab2[strings[0]].norm_ == lex_attr
+    assert vocab2[s].norm_ == lex_attr
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
 def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
-    vocab1[strings[0]].norm_ = lex_attr
-    assert vocab1[strings[0]].norm_ == lex_attr
-    assert vocab2[strings[0]].norm_ != lex_attr
+    s = next(iter(vocab1.strings))
+    vocab1[s].norm_ = lex_attr
+    assert vocab1[s].norm_ == lex_attr
+    assert vocab2[s].norm_ != lex_attr
     with make_tempdir() as d:
         file_path = d / "vocab"
         vocab1.to_disk(file_path)
         vocab2 = vocab2.from_disk(file_path)
-    assert vocab2[strings[0]].norm_ == lex_attr
+    assert vocab2[s].norm_ == lex_attr
 
 
 @pytest.mark.parametrize("strings1,strings2", test_strings)
diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py
index 156e3391aa2..dc2c80bcdd0 100644
--- a/spacy/tests/vocab_vectors/test_lexeme.py
+++ b/spacy/tests/vocab_vectors/test_lexeme.py
@@ -18,7 +18,7 @@ def test_issue361(en_vocab, text1, text2):
 
 @pytest.mark.issue(600)
 def test_issue600():
-    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
+    vocab = Vocab()
     doc = Doc(vocab, words=["hello"])
     doc[0].tag_ = "NN"
 
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 7fbb9764f10..beb7febee63 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -27,7 +27,7 @@ class Vocab:
     def __init__(
         self,
         lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
-        strings: Optional[Union[List[str], StringStore]] = ...,
+        strings: Optional[StringStore] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
         writing_system: Dict[str, Any] = ...,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3145f51844a..3ccfa6db622 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -46,9 +46,8 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
-                 **deprecated_kwargs):
+    def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
+            oov_prob=-20., writing_system=None, get_noun_chunks=None):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -66,16 +65,19 @@ cdef class Vocab:
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
-        self.strings = StringStore()
         self.length = 0
-        if strings:
-            for string in strings:
-                _ = self[string]
+        if strings is None:
+            self.strings = StringStore()
+        else:
+            self.strings = strings
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
         self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
-        self.writing_system = writing_system
+        if writing_system is None:
+            self.writing_system = {}
+        else:
+            self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
 
     property vectors:
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 36369c78427..88d3939142f 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -24,14 +24,15 @@ Create the vocabulary.
 > #### Example
 >
 > ```python
+> from spacy.strings import StringStore
 > from spacy.vocab import Vocab
-> vocab = Vocab(strings=["hello", "world"])
+> vocab = Vocab(strings=StringStore(["hello", "world"]))
 > ```
 
 | Name               | Description                                                                                                                                                             |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~                                      |
-| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           |
+| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~                                                                         |
 | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      |
 | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             |
 | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          |

From 609e3b90ab1173a7cb60b681112abaada4ecf321 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 00:34:35 +0100
Subject: [PATCH 176/504] Introduce hierarchy for EL `Candidate` objects
 (#12341)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimick best practices

* fix docs

* fix import

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                            |   8 +-
 spacy/kb/__init__.py                       |  10 +-
 spacy/kb/candidate.pxd                     |  22 ++--
 spacy/kb/candidate.pyx                     | 135 +++++++++++----------
 spacy/kb/kb.pyx                            |  23 ++--
 spacy/kb/kb_in_memory.pyx                  |  36 +++---
 spacy/ml/models/entity_linker.py           |  28 +++++
 spacy/pipeline/entity_linker.py            |  61 ++++++++--
 spacy/tests/pipeline/test_entity_linker.py |  48 ++++----
 spacy/tests/serialize/test_serialize_kb.py |  12 +-
 website/docs/api/inmemorylookupkb.mdx      |  40 ++----
 website/docs/api/kb.mdx                    |  51 +++-----
 12 files changed, 263 insertions(+), 211 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 83a1e9ba2c0..42fdc12e029 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "ignoring the duplicate entry.")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses. If "
             "you are constructing a parse tree incrementally by setting "
@@ -214,7 +214,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -981,6 +985,8 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 93a65ab6194..fb21083ddee 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,11 +1,7 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+from .candidate import Candidate, InMemoryCandidate
 
-__all__ = [
-    "Candidate",
-    "KnowledgeBase",
-    "InMemoryLookupKB",
-    "get_candidates",
-    "get_candidates_batch",
-]
+
+__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 80fcbc45940..f21f423e496 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,15 +1,15 @@
 from libcpp.vector cimport vector
-
+from .kb_in_memory cimport InMemoryLookupKB
 from ..typedefs cimport hash_t
-from .kb cimport KnowledgeBase
-
 
-# Object used by the Entity Linker that summarizes one entity-alias candidate
-# combination.
 cdef class Candidate:
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
-    cdef float entity_freq
-    cdef vector[float] entity_vector
-    cdef hash_t alias_hash
-    cdef float prior_prob
+    pass
+
+
+cdef class InMemoryCandidate(Candidate):
+    cdef readonly hash_t _entity_hash
+    cdef readonly hash_t _alias_hash
+    cpdef vector[float] _entity_vector
+    cdef float _prior_prob
+    cdef readonly InMemoryLookupKB _kb
+    cdef float _entity_freq
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index 4369676e23a..bf66ccfae67 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,90 +1,97 @@
 # cython: infer_types=True
 
-from typing import Iterable
-
-from .kb cimport KnowledgeBase
-
-from ..tokens import Span
+from .kb_in_memory cimport InMemoryLookupKB
+from ..errors import Errors
 
 
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or
-    may not be resolved to a specific `entity` from a Knowledge Base. This
-    will be used as input for the entity linking algorithm which will
-    disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned a certain prior probability.
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
+    algorithm which will disambiguate the various candidates to the correct one.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.
 
     DOCS: https://spacy.io/api/kb/#candidate-init
     """
 
-    def __init__(
-        self,
-        KnowledgeBase kb,
-        entity_hash,
-        entity_freq,
-        entity_vector,
-        alias_hash,
-        prior_prob
-    ):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
 
     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError
 
     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError
 
     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
 
-    @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB]): InMemoryLookupKB instance.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq
 
     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
+    def entity_id(self) -> int:
+        return self._entity_hash
 
     @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector
 
     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
-
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob
 
-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate
-    entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]
 
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]
 
-def get_candidates_batch(
-        kb: KnowledgeBase, mentions: Iterable[Span]
-) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries
-    from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
-    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
-    """
-    return kb.get_candidates_batch(mentions)
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index c7db34e166a..24cee30ffc7 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -36,10 +36,10 @@ cdef class KnowledgeBase:
         self, mentions: Iterable[Span]
     ) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines
-        the entity, the original alias, and the prior probability of that
-        alias resolving to that entity.
-        If no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidates are found for a given mention, an empty list is returned.
         mentions (Iterable[Span]): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
@@ -47,10 +47,10 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines
-        the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specific mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidate is found for the given mention, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
@@ -128,3 +128,10 @@ cdef class KnowledgeBase:
                 parent="KnowledgeBase", method="from_disk", name=self.__name__
             )
         )
+
+    @property
+    def supports_prior_probs(self) -> bool:
+        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
+        raise NotImplementedError(
+            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
+        )
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 2b21f246a54..3aab0d73e72 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -22,8 +22,7 @@ from ..util import SimpleFrozenList, ensure_path
 
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
-
-from .candidate import Candidate as Candidate
+from .candidate import InMemoryCandidate
 
 
 cdef class InMemoryLookupKB(KnowledgeBase):
@@ -255,10 +254,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             alias_entry.probs = probs
             self._aliases_table[alias_index] = alias_entry
 
-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        return self.get_alias_candidates(mention.text)  # type: ignore
+    def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
+        return self._get_alias_candidates(mention.text)  # type: ignore
 
-    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
         Return candidate entities for an alias. Each candidate defines the
         entity, the original alias, and the prior probability of that alias
@@ -271,18 +270,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
-                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[
-                              self._entries[entry_index].vector_index
-                          ],
-                          alias_hash=alias_hash,
-                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(
-                    alias_entry.entry_indices, alias_entry.probs
-                )
-                if entry_index != 0]
+        return [
+            InMemoryCandidate(
+                kb=self,
+                entity_hash=self._entries[entry_index].entity_hash,
+                alias_hash=alias_hash,
+                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                prior_prob=prior_prob,
+                entity_freq=self._entries[entry_index].freq
+            )
+            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+            if entry_index != 0
+        ]
 
     def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -316,6 +315,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
         return 0.0
 
+    def supports_prior_probs(self) -> bool:
+        return True
+
     def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.
         """
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index b7100c00a4b..99522c4617c 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -14,6 +14,12 @@
 )
 from thinc.types import Floats2d
 
+from ...util import registry
+from ...kb import KnowledgeBase, InMemoryLookupKB
+from ...kb import Candidate
+from ...vocab import Vocab
+from ...tokens import Span, Doc
+from ..extract_spans import extract_spans
 from ...errors import Errors
 from ...kb import (
     Candidate,
@@ -132,3 +138,25 @@ def create_candidates_batch() -> Callable[
     [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+    """
+    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mention (Span): Entity mention for which to identify candidates.
+    RETURNS (Iterable[Candidate]): Identified candidates.
+    """
+    return kb.get_candidates(mention)
+
+
+def get_candidates_batch(
+    kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
+    """
+    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+    """
+    return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 546bd9f6e2a..410da58a46d 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,5 +1,5 @@
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
-from typing import cast
+import warnings
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
 from numpy import dtype
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from pathlib import Path
@@ -15,14 +15,13 @@
 from thinc.types import Floats2d
 
 from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
 from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
@@ -252,6 +251,8 @@ def __init__(
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
+        if self.incl_prior and not self.kb.supports_prior_probs:
+            warnings.warn(Warnings.W401)
 
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
@@ -524,17 +525,51 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                             entity_encodings = xp.asarray(
                                 [c.entity_vector for c in candidates]
                             )
-                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                            if len(entity_encodings) != len(prior_probs):
-                                raise RuntimeError(
-                                    Errors.E147.format(
-                                        method="predict",
-                                        msg="vectors not of equal length",
+                        elif len(candidates) == 1 and self.threshold is None:
+                            # shortcut for efficiency reasons: take the 1 candidate
+                            final_kb_ids.append(candidates[0].entity_id_)
+                            self._add_activations(
+                                doc_scores=doc_scores,
+                                doc_ents=doc_ents,
+                                scores=[1.0],
+                                ents=[candidates[0].entity_id],
+                            )
+                        else:
+                            random.shuffle(candidates)
+                            # set all prior probabilities to 0 if incl_prior=False
+                            if self.incl_prior and self.kb.supports_prior_probs:
+                                prior_probs = xp.asarray([c.prior_prob for c in candidates])  # type: ignore
+                            else:
+                                prior_probs = xp.asarray([0.0 for _ in candidates])
+                            scores = prior_probs
+                            # add in similarity from the context
+                            if self.incl_context:
+                                entity_encodings = xp.asarray(
+                                    [c.entity_vector for c in candidates]
+                                )
+                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                                if len(entity_encodings) != len(prior_probs):
+                                    raise RuntimeError(
+                                        Errors.E147.format(
+                                            method="predict",
+                                            msg="vectors not of equal length",
+                                        )
                                     )
                                 )
-                            # cosine similarity
-                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                sentence_norm * entity_norm
+                                if sims.shape != prior_probs.shape:
+                                    raise ValueError(Errors.E161)
+                                scores = prior_probs + sims - (prior_probs * sims)
+                            final_kb_ids.append(
+                                candidates[scores.argmax().item()].entity_id_
+                                if self.threshold is None
+                                or scores.max() >= self.threshold
+                                else EntityLinker.NIL
+                            )
+                            self._add_activations(
+                                doc_scores=doc_scores,
+                                doc_ents=doc_ents,
+                                scores=scores,
+                                ents=[c.entity_id for c in candidates],
                             )
                             if sims.shape != prior_probs.shape:
                                 raise ValueError(Errors.E161)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index f28a4c9d5b9..4ba7cc1a16d 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -7,9 +7,10 @@
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
-from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
+from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker, get_candidates
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
@@ -452,16 +453,17 @@ def test_candidate_generation(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
+    adam_ent_cands = get_candidates(mykb, adam_ent)
     assert len(get_candidates(mykb, douglas_ent)) == 2
-    assert len(get_candidates(mykb, adam_ent)) == 1
+    assert len(adam_ent_cands) == 1
     assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive
     assert len(get_candidates(mykb, shrubbery_ent)) == 0
 
     # test the content of the candidates
-    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
-    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
-    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
-    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
+    assert adam_ent_cands[0].entity_id_ == "Q2"
+    assert adam_ent_cands[0].alias == "adam"
+    assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
+    assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
 
 
 def test_el_pipe_configuration(nlp):
@@ -489,7 +491,7 @@ def create_kb(vocab):
     assert doc[2].ent_kb_id_ == "Q2"
 
     def get_lowercased_candidates(kb, span):
-        return kb.get_alias_candidates(span.text.lower())
+        return kb._get_alias_candidates(span.text.lower())
 
     def get_lowercased_candidates_batch(kb, spans):
         return [get_lowercased_candidates(kb, span) for span in spans]
@@ -548,24 +550,22 @@ def test_vocab_serialization(nlp):
     mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
-    candidates = mykb.get_alias_candidates("adam")
+    candidates = mykb._get_alias_candidates("adam")
     assert len(candidates) == 1
-    assert candidates[0].entity == q2_hash
-    assert candidates[0].entity_ == "Q2"
-    assert candidates[0].alias == adam_hash
-    assert candidates[0].alias_ == "adam"
+    assert candidates[0].entity_id == q2_hash
+    assert candidates[0].entity_id_ == "Q2"
+    assert candidates[0].alias == "adam"
 
     with make_tempdir() as d:
         mykb.to_disk(d / "kb")
         kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         kb_new_vocab.from_disk(d / "kb")
 
-        candidates = kb_new_vocab.get_alias_candidates("adam")
+        candidates = kb_new_vocab._get_alias_candidates("adam")
         assert len(candidates) == 1
-        assert candidates[0].entity == q2_hash
-        assert candidates[0].entity_ == "Q2"
-        assert candidates[0].alias == adam_hash
-        assert candidates[0].alias_ == "adam"
+        assert candidates[0].entity_id == q2_hash
+        assert candidates[0].entity_id_ == "Q2"
+        assert candidates[0].alias == "adam"
 
         assert kb_new_vocab.get_vector("Q2") == [2]
         assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@@ -585,20 +585,20 @@ def test_append_alias(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
-    assert len(mykb.get_alias_candidates("douglas")) == 2
+    assert len(mykb._get_alias_candidates("douglas")) == 2
 
     # append an alias
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
     # test the size of the relevant candidates has been incremented
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3
 
     # append the same alias-entity pair again should not work (will throw a warning)
     with pytest.warns(UserWarning):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
 
     # test the size of the relevant candidates remained unchanged
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3
 
 
 @pytest.mark.filterwarnings("ignore:\\[W036")
@@ -895,11 +895,11 @@ def test_kb_to_bytes():
     assert kb_2.contains_alias("Russ Cochran")
     assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
     assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
-        kb_2.get_alias_candidates("Russ Cochran")
+    assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
+        kb_2._get_alias_candidates("Russ Cochran")
     )
-    assert len(kb_1.get_alias_candidates("Randomness")) == len(
-        kb_2.get_alias_candidates("Randomness")
+    assert len(kb_1._get_alias_candidates("Randomness")) == len(
+        kb_2._get_alias_candidates("Randomness")
     )
 
 
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 99eb8cd8694..b6bad3c46ee 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -66,19 +66,21 @@ def _check_kb(kb):
         assert alias_string not in kb.get_alias_strings()
 
     # check candidates & probabilities
-    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
+    candidates = sorted(
+        kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
+    )
     assert len(candidates) == 2
 
-    assert candidates[0].entity_ == "Q007"
+    assert candidates[0].entity_id_ == "Q007"
     assert 6.999 < candidates[0].entity_freq < 7.01
     assert candidates[0].entity_vector == [0, 0, 7]
-    assert candidates[0].alias_ == "double07"
+    assert candidates[0].alias == "double07"
     assert 0.899 < candidates[0].prior_prob < 0.901
 
-    assert candidates[1].entity_ == "Q17"
+    assert candidates[1].entity_id_ == "Q17"
     assert 1.99 < candidates[1].entity_freq < 2.01
     assert candidates[1].entity_vector == [7, 1, 0]
-    assert candidates[1].alias_ == "double07"
+    assert candidates[1].alias == "double07"
     assert 0.099 < candidates[1].prior_prob < 0.101
 
 
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 15b1d3bf29c..4621d883810 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -10,9 +10,9 @@ version: 3.5
 
 The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
 implements all of its methods. It stores all KB data in-memory and generates
-[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
-entity names. It's highly optimized for both a low memory footprint and speed of
-retrieval.
+[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
+with entity names. It's highly optimized for both a low memory footprint and
+speed of retrieval.
 
 ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
 
@@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
 ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
 
 Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate). Wraps
+of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
 [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 
 > #### Example
@@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
 > candidates = kb.get_candidates(doc[0:2])
 > ```
 
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `mention`   | The textual mention or alias. ~~Span~~                               |
-| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
+| Name        | Description                                                                          |
+| ----------- | ------------------------------------------------------------------------------------ |
+| `mention`   | The textual mention or alias. ~~Span~~                                               |
+| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
 
 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
 
@@ -194,26 +194,10 @@ to you.
 > candidates = kb.get_candidates((doc[0:2], doc[3:]))
 > ```
 
-| Name        | Description                                                                                  |
-| ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
-| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
-
-## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate).
-
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name        | Description                                                   |
-| ----------- | ------------------------------------------------------------- |
-| `alias`     | The textual mention or alias. ~~str~~                         |
-| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+| Name        | Description                                                                                                  |
+| ----------- | ------------------------------------------------------------------------------------------------------------ |
+| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
 
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 2b0d4d9d6b3..9536a3fe375 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -103,23 +103,6 @@ to you.
 | `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
-## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-<Infobox variant="warning">
-  This method is _not_ available from spaCy 3.5 onwards.
-</Infobox>
-
-From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
-allow more flexibility in customizing knowledge bases. Some of its methods were
-moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
-one of those being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-Note:
-[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
-defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
 
 Given a certain entity ID, retrieve its pretrained entity vector.
@@ -190,25 +173,27 @@ Restore the state of the knowledge base from a given directory. Note that the
 | `exclude`   | List of components to exclude. ~~Iterable[str]~~                                                |
 | **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~                                          |
 
-## Candidate {id="candidate",tag="class"}
+## InMemoryCandidate {id="candidate",tag="class"}
 
-A `Candidate` object refers to a textual mention (alias) that may or may not be
-resolved to a specific entity from a `KnowledgeBase`. This will be used as input
-for the entity linking algorithm which will disambiguate the various candidates
-to the correct one. Each candidate `(alias, entity)` pair is assigned to a
-certain prior probability.
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or
+may not be resolved to a specific entity from a `KnowledgeBase`. This will be
+used as input for the entity linking algorithm which will disambiguate the
+various candidates to the correct one. Each candidate `(alias, entity)` pair is
+assigned to a certain prior probability.
 
-### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
+### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
 
-Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the `get_candidates` method of the
-[`entity_linker`](/api/entitylinker) pipe.
+Construct an `InMemoryCandidate` object. Usually this constructor is not called
+directly, but instead these objects are returned by the `get_candidates` method
+of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example
+> #### Example```python
+>
+> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
+> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+>
+> ```
 >
-> ```python
-> from spacy.kb import Candidate
-> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
 > ```
 
 | Name          | Description                                                               |
@@ -216,10 +201,10 @@ but instead these objects are returned by the `get_candidates` method of the
 | `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~         |
 | `entity_hash` | The hash of the entity's KB ID. ~~int~~                                   |
 | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~                     |
-| `alias_hash`  | The hash of the textual mention or alias. ~~int~~                         |
+| `alias_hash`  | The hash of the entity alias. ~~int~~                                     |
 | `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
 
-## Candidate attributes {id="candidate-attributes"}
+## InMemoryCandidate attributes {id="candidate-attributes"}
 
 | Name            | Description                                                              |
 | --------------- | ------------------------------------------------------------------------ |

From 830dd88a20f4085a7920f9fc65cee2bc7928deb6 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 177/504] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/kb/__init__.py                       |  1 -
 spacy/kb/kb.pyx                            | 10 ++---
 spacy/ml/models/entity_linker.py           |  8 ++--
 spacy/pipeline/entity_linker.py            | 45 ++++++++++------------
 spacy/tests/pipeline/test_entity_linker.py |  1 -
 website/docs/api/inmemorylookupkb.mdx      |  5 ++-
 website/docs/api/kb.mdx                    | 11 +++---
 7 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index fb21083ddee..7155c15df9a 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -3,5 +3,4 @@
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate
 
-
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index 24cee30ffc7..bb58bf88a46 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,8 +5,8 @@ from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
 
-from ..errors import Errors
-from ..tokens import Span
+from .candidate import Candidate
+from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from .candidate import Candidate
 
@@ -32,15 +32,13 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()
 
-    def get_candidates_batch(
-        self, mentions: Iterable[Span]
-    ) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
         """
         Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
         probability of the specified mention text resolving to that entity - might be included.
         If no candidates are found for a given mention, an empty list is returned.
-        mentions (Iterable[Span]): Mentions for which to get candidates.
+        mentions (SpanGroup): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         return [self.get_candidates(span) for span in mentions]
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 99522c4617c..db960fbd0a9 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -18,7 +18,7 @@
 from ...kb import KnowledgeBase, InMemoryLookupKB
 from ...kb import Candidate
 from ...vocab import Vocab
-from ...tokens import Span, Doc
+from ...tokens import Doc, Span, SpanGroup
 from ..extract_spans import extract_spans
 from ...errors import Errors
 from ...kb import (
@@ -135,7 +135,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
 
 @registry.misc("spacy.CandidateBatchGenerator.v1")
 def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+    [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
 
@@ -151,12 +151,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
 
 
 def get_candidates_batch(
-    kb: KnowledgeBase, mentions: Iterable[Span]
+    kb: KnowledgeBase, mentions: SpanGroup
 ) -> Iterable[Iterable[Candidate]]:
     """
     Return candidate entities for the given mentions and fetching appropriate entries from the index.
     kb (KnowledgeBase): Knowledge base to query.
-    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    mentions (SpanGroup): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
     """
     return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 410da58a46d..4882ead1d92 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,6 +16,8 @@
 
 from ..kb import KnowledgeBase, Candidate
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -97,7 +99,7 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
     ],
     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
@@ -120,7 +122,7 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
         produces a list of candidates, given a certain knowledge base and a textual mention.
     get_candidates_batch (
-        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
@@ -185,7 +187,7 @@ def __init__(
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
         ],
         overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
@@ -208,7 +210,7 @@ def __init__(
         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
         get_candidates_batch (
-            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
         overwrite (bool): Whether to overwrite existing non-empty annotations.
@@ -472,26 +474,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 continue
             sentences = [s for s in doc.sents]
 
-                if self.incl_context:
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    sentence_encoding = self.model.predict([sent_doc])[0]
-                    sentence_encoding_t = sentence_encoding.T
-                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                entity_count += 1
-                if ent.label_ in self.labels_discard:
-                    # ignoring this entity - setting to NIL
-                    final_kb_ids.append(self.NIL)
-                    self._add_activations(
-                        doc_scores=doc_scores,
-                        doc_ents=doc_ents,
-                        scores=[0.0],
-                        ents=[0],
+            # Loop over entities in batches.
+            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
+                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
+
+                # Look up candidate entities.
+                valid_ent_idx = [
+                    idx
+                    for idx in range(len(ent_batch))
+                    if ent_batch[idx].label_ not in self.labels_discard
+                ]
+
+                batch_candidates = list(
+                    self.get_candidates_batch(
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                 else:
                     candidates = list(self.get_candidates(self.kb, ent))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 4ba7cc1a16d..170f2215f83 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -985,7 +985,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 4621d883810..712cce30747 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -189,14 +189,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                                  |
 | ----------- | ------------------------------------------------------------------------------------------------------------ |
-| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                                          |
 | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 9536a3fe375..94506162f27 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -93,14 +93,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                  |
 | ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                          |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
 directly, but instead these objects are returned by the `get_candidates` method
 of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example```python
+> #### Example
 >
+> ```python
 > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
 > entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
->
-> ```
->
 > ```
 
 | Name          | Description                                                               |

From 0b2395adf5a8c35aa03151955f4d4dc341c9a5d4 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 27 Mar 2023 09:18:23 +0200
Subject: [PATCH 178/504] Add info that Vocab and StringStore are not static in
 docs (#12427)

* Add size increase info about vocab and stringstore

* Update website/docs/api/stringstore.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Update website/docs/api/vocab.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Change wording

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 website/docs/api/stringstore.mdx | 2 +-
 website/docs/api/vocab.mdx       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 269ac2d0c4b..1b1f3bd5352 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -8,7 +8,7 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.
 
-<Infobox variant ="warning">
+<Infobox variant="warning">
 
 Note that a `StringStore` instance is not static. It increases in size as texts
 with new tokens are processed.
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 88d3939142f..319ce88b8dc 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -10,7 +10,7 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.
 
-<Infobox variant ="warning">
+<Infobox variant="warning">
 
 Note that a `Vocab` instance is not static. It increases in size as texts with
 new tokens are processed. Some models may have an empty vocab at initialization.

From 1dad3ecbed201733f99a73e2bc44e8b83c4c80cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 30 Mar 2023 09:30:42 +0200
Subject: [PATCH 179/504] Avoid `TrainablePipe.finish_update` getting called
 twice during training (#12450)

* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradient were accumulated. However, it introduced a new bug that cause
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed.

This change restores proper handling/use of `False`. Moreover, the role
of `False` is now documented and added to the type to avoid future
accidents.

* Fix typo

* Document defaults for `Language.update`
---
 spacy/language.py             |  7 +++++--
 spacy/tests/test_language.py  | 18 ++++++++++++++++++
 spacy/training/loop.py        |  2 +-
 website/docs/api/language.mdx | 18 +++++++++---------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e8a7d719ef2..b8c4322d3b4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1230,7 +1230,7 @@ def update(
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1241,7 +1241,9 @@ def update(
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1300,6 +1302,7 @@ def update(
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 88ef3d434c0..e4b06893c93 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -162,6 +162,24 @@ def test_language_update_updates():
     )
 
 
+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e6b3451cd73..9497b95aba5 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -221,7 +221,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 2a1f7a1a961..e38e49bf569 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 >     nlp.update([example], sgd=optimizer)
 > ```
 
-| Name            | Description                                                                                                                                    |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
-| _keyword-only_  |                                                                                                                                                |
-| `drop`          | The dropout rate. ~~float~~                                                                                                                    |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                  |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
+| Name            | Description                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                                                                                  |
+| _keyword-only_  |                                                                                                                                                                                                    |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~                                                                                                                                                     |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~                                                                                |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.distill {id="distill",tag="method,experimental",version="4"}
 

From 1d11dc178442109258451b33027c4c7360febcf0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 Apr 2023 16:01:59 +0200
Subject: [PATCH 180/504] Enforce that Span.start/end(_char) remain valid and
 in sync (#12268)

* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
---
 spacy/errors.py              |  5 +++-
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx        | 49 +++++++++++++++++++++++++++---------
 3 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 42fdc12e029..fe067f7915d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -932,7 +932,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -986,6 +986,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 74874624888..0b05ca7c123 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -696,3 +696,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 1378889c681..f51a1c5ee3e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -800,36 +800,61 @@ cdef class Span:
             return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0:
-                raise IndexError("TODO")
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx
 
     property end:
         def __get__(self):
             return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0:
-                raise IndexError("TODO")
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError("TODO")
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError("TODO")
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end
 
     property label:
         def __get__(self):

From 89a63dbaf86ba087e108c57ab1634e03247fbb34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 21 Apr 2023 13:49:40 +0200
Subject: [PATCH 181/504] Add distillation loop (#12542)

* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We apply finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                 |  26 ++-
 spacy/schemas.py                  |   2 +-
 spacy/tests/training/test_loop.py | 111 +++++++++++
 spacy/training/initialize.py      | 134 ++++++++++---
 spacy/training/loop.py            | 317 +++++++++++++++++++++++++++---
 spacy/util.py                     |  20 ++
 website/docs/api/language.mdx     |  26 +--
 7 files changed, 560 insertions(+), 76 deletions(-)
 create mode 100644 spacy/tests/training/test_loop.py

diff --git a/spacy/language.py b/spacy/language.py
index b8c4322d3b4..028f733200e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1052,7 +1052,7 @@ def distill(
         examples: Iterable[Example],
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1065,7 +1065,9 @@ def distill(
             (teacher) and predicted (student) docs must have the same number of
             tokens and the same orthography.
         drop (float): The dropout rate.
-        sgd (Optional[Optimizer]): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Optional(Dict[str, float])): Dictionary to update with the loss,
             keyed by component.
         component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
@@ -1135,11 +1137,23 @@ def distill(
                 student_proc.distill(
                     teacher_pipe,
                     examples,
-                    sgd=sgd,
+                    sgd=None,
                     losses=losses,
                     **component_cfg[student_name],
                 )
 
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for student_name, student_proc in self.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and sgd not in (None, False)
+            ):
+                student_proc.finish_update(sgd)
+
         return losses
 
     def disable_pipes(self, *names) -> "DisabledPipes":
@@ -1908,7 +1922,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
-        orig_distill = config.pop("distill", None)
+        orig_distill = config.pop("distillation", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1918,8 +1932,8 @@ def from_config(
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         if orig_distill is not None:
-            filled["distill"] = orig_distill
-            config["distill"] = orig_distill
+            filled["distillation"] = orig_distill
+            config["distillation"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 32fb042b5a0..7fc5ec20e51 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -510,7 +510,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
-    "distill": ConfigSchemaDistill,
+    "distillation": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/training/test_loop.py b/spacy/tests/training/test_loop.py
new file mode 100644
index 00000000000..46d01509504
--- /dev/null
+++ b/spacy/tests/training/test_loop.py
@@ -0,0 +1,111 @@
+from typing import Callable, Iterable, Iterator
+import pytest
+from spacy import Language
+from spacy.training import Example
+from spacy.training.initialize import init_nlp_student
+from spacy.training.loop import distill, train
+from spacy.util import load_model_from_config, registry
+from thinc.api import Config
+
+
+@pytest.fixture
+def config_str():
+    return """
+    [nlp]
+    lang = "en"
+    pipeline = ["senter"]
+    disabled = []
+    before_creation = null
+    after_creation = null
+    after_pipeline_creation = null
+    batch_size = 1000
+    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+    [components]
+
+    [components.senter]
+    factory = "senter"
+
+    [training]
+    dev_corpus = "corpora.dev"
+    train_corpus = "corpora.train"
+    max_steps = 50
+    seed = 1
+    gpu_allocator = null
+
+    [distillation]
+    corpus = "corpora.train"
+    dropout = 0.1
+    max_epochs = 0
+    max_steps = 50
+    student_to_teacher = {}
+
+    [distillation.batcher]
+    @batchers = "spacy.batch_by_words.v1"
+    size = 3000
+    discard_oversize = false
+    tolerance = 0.2
+
+    [distillation.optimizer]
+    @optimizers = "Adam.v1"
+    beta1 = 0.9
+    beta2 = 0.999
+    L2_is_weight_decay = true
+    L2 = 0.01
+    grad_clip = 1.0
+    use_averages = true
+    eps = 1e-8
+    learn_rate = 1e-4
+
+    [corpora]
+
+    [corpora.dev]
+    @readers = "sentence_corpus"
+
+    [corpora.train]
+    @readers = "sentence_corpus"
+    """
+
+
+SENT_STARTS = [0] * 14
+SENT_STARTS[0] = 1
+SENT_STARTS[5] = 1
+SENT_STARTS[9] = 1
+
+TRAIN_DATA = [
+    (
+        "I like green eggs. Eat blue ham. I like purple eggs.",
+        {"sent_starts": SENT_STARTS},
+    ),
+    (
+        "She likes purple eggs. They hate ham. You like yellow eggs.",
+        {"sent_starts": SENT_STARTS},
+    ),
+]
+
+
+@pytest.mark.slow
+def test_distill_loop(config_str):
+    @registry.readers("sentence_corpus")
+    def create_sentence_corpus() -> Callable[[Language], Iterable[Example]]:
+        return SentenceCorpus()
+
+    class SentenceCorpus:
+        def __call__(self, nlp: Language) -> Iterator[Example]:
+            for t in TRAIN_DATA:
+                yield Example.from_dict(nlp.make_doc(t[0]), t[1])
+
+    orig_config = Config().from_str(config_str)
+    teacher = load_model_from_config(orig_config, auto_fill=True, validate=True)
+    teacher.initialize()
+    train(teacher)
+
+    orig_config = Config().from_str(config_str)
+    student = init_nlp_student(orig_config, teacher)
+    student.initialize()
+    distill(teacher, student)
+
+    doc = student(TRAIN_DATA[0][0])
+    assert doc.sents[0].text == "I like green eggs."
+    assert doc.sents[1].text == "Eat blue ham."
+    assert doc.sents[2].text == "I like purple eggs."
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 191821e786e..61ad1c09cc0 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,3 +1,9 @@
+from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
+from thinc.api import Config, ConfigValidationError
+from pathlib import Path
+import srsly
+import numpy
+import tarfile
 import gzip
 import tarfile
 import warnings
@@ -12,22 +18,11 @@
 from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
 
 from ..errors import Errors, Warnings
-from ..lookups import Lookups
-from ..schemas import ConfigSchemaTraining
-from ..util import (
-    DEFAULT_OOV_PROB,
-    OOV_RANK,
-    ensure_path,
-    get_sourced_components,
-    load_model,
-    load_model_from_config,
-    logger,
-    registry,
-    resolve_dot_names,
-)
-from ..vectors import Mode as VectorsMode
-from ..vectors import Vectors
-from .pretrain import get_tok2vec_ref
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
+from ..util import load_model, ensure_path, get_sourced_components
+from ..util import OOV_RANK, DEFAULT_OOV_PROB
+from ..util import set_gpu_allocator_from_config, set_seed_from_config
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -36,15 +31,8 @@
 def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
-    if "seed" not in config["training"]:
-        raise ValueError(Errors.E1015.format(value="[training] seed"))
-    if "gpu_allocator" not in config["training"]:
-        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
     # Use original config here before it's resolved to functions
     sourced = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
@@ -111,6 +99,102 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     return nlp
 
 
+def init_nlp_student(
+    config: Config, teacher: "Language", *, use_gpu: int = -1
+) -> "Language":
+    """Initialize student pipeline for distillation.
+
+    config (Config): Student model configuration.
+    teacher (Language): The teacher pipeline to distill from.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    """
+    raw_config = config
+    config = raw_config.interpolate()
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
+
+    # Use original config here before it's resolved to functions
+    sourced = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    logger.info("Set up nlp object from config")
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    D = registry.resolve(config["distillation"], schema=ConfigSchemaDistill)
+    dot_names = [T["dev_corpus"]]
+    if not isinstance(D["corpus"], str):
+        raise ConfigValidationError(
+            desc=Errors.E897.format(field="distillation.corpus", type=type(D["corpus"]))
+        )
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(
+            desc=Errors.E897.format(
+                field="training.dev_corpus", type=type(T["dev_corpus"])
+            )
+        )
+    (dev_corpus,) = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    # Make sure that listeners are defined before initializing further
+    nlp._link_components()
+
+    # Get teacher labels to initialize student with.
+    student_to_teacher = D["student_to_teacher"]
+    teacher_pipes = dict(teacher.pipeline)
+    labels = {}
+    for name, pipe in nlp.pipeline:
+        # Copy teacher labels.
+        teacher_pipe_name = (
+            student_to_teacher[name] if name in student_to_teacher else name
+        )
+        teacher_pipe = teacher_pipes.get(teacher_pipe_name, None)
+        if (
+            teacher_pipe is not None
+            and getattr(teacher_pipe, "label_data", None) is not None
+        ):
+            labels[name] = teacher_pipe.label_data  # type: ignore[attr-defined]
+
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        # Initialize on the dev corpus, since the distillation corpus does
+        # usually not have labels. Since we copy the labels from the teacher
+        # pipe, the dev data does not have to be exhaustive.
+        if T["max_epochs"] == -1:
+            sample_size = 100
+            logger.debug(
+                f"Due to streamed train corpus, using only first {sample_size} "
+                f"examples for initialization. If necessary, provide all labels "
+                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+            )
+            nlp.initialize(lambda: islice(dev_corpus(nlp), sample_size), sgd=optimizer)
+        else:
+            nlp.initialize(lambda: dev_corpus(nlp), sgd=optimizer, labels=labels)
+        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    # Detect components with listeners that are not frozen consistently
+    for name, proc in nlp.pipeline:
+        for listener in getattr(
+            proc, "listening_components", []
+        ):  # e.g. tok2vec/transformer
+            # Don't warn about components not in the pipeline
+            if listener not in nlp.pipe_names:
+                continue
+            if listener in frozen_components and name not in frozen_components:
+                logger.warning(Warnings.W087.format(name=name, listener=listener))
+            # We always check this regardless, in case user freezes tok2vec
+            if listener not in frozen_components and name in frozen_components:
+                if name not in T["annotating_components"]:
+                    logger.warning(Warnings.W086.format(name=name, listener=listener))
+    return nlp
+
+
 def init_vocab(
     nlp: "Language",
     *,
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 9497b95aba5..ad162678fec 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,26 +3,20 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
-
-from thinc.api import Config, Optimizer, constant, fix_random_seed, set_gpu_allocator
+from thinc.api import Optimizer, Config, constant
 from wasabi import Printer
+import random
+import sys
+import shutil
+
 
-from ..errors import Errors
-from ..schemas import ConfigSchemaTraining
-from ..util import logger, registry, resolve_dot_names
 from .example import Example
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from ..errors import Errors
+from ..tokens.doc import Doc
+from .. import ty
+from ..util import resolve_dot_names, registry, logger
+from ..util import set_gpu_allocator_from_config, set_seed_from_config
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -32,6 +26,129 @@
 DIR_MODEL_LAST = "model-last"
 
 
+def distill(
+    teacher: "Language",
+    student: "Language",
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    stdout: IO = sys.stdout,
+    stderr: IO = sys.stderr,
+) -> Tuple["Language", Optional[Path]]:
+    """Distill a student pipeline from a teacher pipeline.
+
+    teacher (Language): The teacher pipeline to distill from.
+    student (Language): The student pipeline to distill into.
+    output_path (Optional[Path]): Optional output path to save the student
+        model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    stdout (file): A file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+    stderr (file): A second file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+
+    RETURNS (tuple): The final student nlp object and the path to the exported
+        student model.
+    """
+    # We use no_print here so we can respect the stdout/stderr options.
+    msg = Printer(no_print=True)
+    # Create iterator, which yields out info after each optimization step.
+    config = student.config.interpolate()
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    D = registry.resolve(config["distillation"], schema=ConfigSchemaDistill)
+    dot_names = [D["corpus"], T["dev_corpus"]]
+    distill_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = D["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = D["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    before_update = T["before_update"]
+    student_to_teacher = D["student_to_teacher"]
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with student.use_params(optimizer.averages):
+            before_to_disk(student).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Components that should set annotations on update
+    annotating_components = T["annotating_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = _distill_loop(
+        teacher,
+        student,
+        optimizer,
+        create_distill_batches(student, distill_corpus, batcher, D["max_epochs"]),
+        create_evaluation_callback(student, dev_corpus, score_weights),
+        dropout=D["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        max_steps=D["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+        annotating_components=annotating_components,
+        before_update=before_update,
+        student_to_teacher=student_to_teacher,
+    )
+    clean_output_dir(output_path)
+    stdout.write(msg.info(f"Teacher pipeline: {teacher.pipe_names}") + "\n")
+    stdout.write(msg.info(f"Student pipeline: {student.pipe_names}") + "\n")
+    if frozen_components:
+        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    if annotating_components:
+        stdout.write(
+            msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
+        )
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
+    with student.select_pipes(disable=frozen_components):
+        log_step, finalize_logger = train_logger(student, stdout, stderr)
+    try:
+        for batch, info, is_best_checkpoint in training_step_iterator:
+            if is_best_checkpoint is not None:
+                with student.select_pipes(disable=frozen_components):
+                    update_meta(T, student, info)
+                if output_path is not None:
+                    save_checkpoint(is_best_checkpoint)
+                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
+            log_step(info if is_best_checkpoint is not None else None)
+    except Exception as e:
+        if output_path is not None:
+            stdout.write(
+                msg.warn(
+                    f"Aborting and saving the final best model. "
+                    f"Encountered exception: {repr(e)}"
+                )
+                + "\n"
+            )
+        raise e
+    finally:
+        finalize_logger()
+        if output_path is not None:
+            save_checkpoint(False)
+    # This will only run if we did't hit an error
+    if optimizer.averages:
+        student.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (student, output_path / DIR_MODEL_LAST)
+    else:
+        return (student, None)
+
+
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -57,11 +174,8 @@ def train(
     msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
@@ -150,11 +264,131 @@ def save_checkpoint(is_best):
         return (nlp, None)
 
 
+def _distill_loop(
+    teacher: "Language",
+    student: "Language",
+    optimizer: Optimizer,
+    distill_data: Iterable[List[Example]],
+    evaluate: Callable[[], Tuple[float, Dict[str, float]]],
+    *,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    max_steps: int,
+    exclude: List[str],
+    annotating_components: List[str],
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
+    student_to_teacher: Dict[str, str],
+):
+    """Distill until the data is exhausted or the maximum number of steps
+    has been reached. Works as a generator, with each iteration yielding
+    a tuple `(batch, info, is_best_checkpoint)`, where info is a dict, and
+    is_best_checkpoint is in [True, False, None] -- None indicating that
+    the iteration was not evaluated as a checkpoint. The evaluation is
+    conducted by calling the evaluate callback.
+
+    Positional arguments:
+        teacher (Language): The teacher pipeline to distill from.
+        student (Language): The student pipeline to distill into.
+        optimizer: The optimizer callable.
+        distill_data (Iterable[List[Example]]): A generator of batches,
+            with the distillation data. The distillation data iterable
+            needs to take care of iterating over the epochs and shuffling.
+        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+            The callback should take no arguments and return a tuple
+            `(main_score, other_scores)`. The main_score should be a float where
+            higher is better. other_scores can be any object.
+
+    Every iteration, the function yields out a tuple with:
+
+    * batch: A list of Example objects.
+    * info: A dict with various information about the last update (see below).
+    * is_best_checkpoint: A value in None, False, True, indicating whether this
+        was the best evaluation so far. You should use this to save the model
+        checkpoints during training. If None, evaluation was not conducted on
+        that iteration. False means evaluation was conducted, but a previous
+        evaluation was better.
+
+    The info dict provides the following information:
+
+        epoch (int): How many passes over the data have been completed.
+        step (int): How many steps have been completed.
+        score (float): The main score from the last evaluation.
+        other_scores: : The other scores from the last evaluation.
+        losses: The accumulated losses throughout training.
+        checkpoints: A list of previous results, where each result is a
+            (score, step, epoch) tuple.
+    """
+    if isinstance(dropout, float):
+        dropouts = constant(dropout)
+    else:
+        dropouts = dropout
+    results = []
+    losses: Dict[str, float] = {}
+    words_seen = 0
+    start_time = timer()
+    for step, (epoch, batch) in enumerate(distill_data):
+        if before_update:
+            before_update_args = {"step": step, "epoch": epoch}
+            before_update(student, before_update_args)
+        dropout = dropouts(optimizer.step)
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
+            student.distill(
+                teacher,
+                subbatch,
+                drop=dropout,
+                losses=losses,
+                sgd=False,
+                exclude=exclude,
+                annotates=annotating_components,
+                student_to_teacher=student_to_teacher,
+            )
+        # TODO: refactor this so we don't have to run it separately in here
+        for student_name, student_proc in student.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and student_proc.model not in (False, None)  # type: ignore[attr-defined]
+            ):
+                student_proc.finish_update(optimizer)  # type: ignore[attr-defined]
+        optimizer.step_schedules()
+        if not (step % eval_frequency):
+            if optimizer.averages:
+                with student.use_params(optimizer.averages):
+                    score, other_scores = evaluate()
+            else:
+                score, other_scores = evaluate()
+            optimizer.last_score = score  # type: ignore[assignment]
+            results.append((score, step))
+            is_best_checkpoint = score == max(results)[0]
+        else:
+            score, other_scores = (None, None)
+            is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
+        info = {
+            "epoch": epoch,
+            "step": step,
+            "score": score,
+            "other_scores": other_scores,
+            "losses": losses,
+            "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
+        }
+        yield batch, info, is_best_checkpoint
+        if is_best_checkpoint is not None:
+            losses = {}
+        # Stop if we've exhausted our max steps (if specified)
+        if max_steps and step >= max_steps:
+            break
+
+
 def train_while_improving(
     nlp: "Language",
     optimizer: Optimizer,
-    train_data,
-    evaluate,
+    train_data: Iterable[List[Example]],
+    evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
     eval_frequency: int,
@@ -174,10 +408,9 @@ def train_while_improving(
     Positional arguments:
         nlp: The spaCy pipeline to evaluate.
         optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
+        train_data (Iterable[List[Example]]): A generator of batches, with the
+            training data. The training data iterable needs to take care of
+            iterating over the epochs and shuffling.
         evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
             The callback should take no arguments and return a tuple
             `(main_score, other_scores)`. The main_score should be a float where
@@ -241,7 +474,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
-            optimizer.last_score = score
+            optimizer.last_score = score  # type: ignore[assignment]
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
@@ -273,9 +506,15 @@ def train_while_improving(
             break
 
 
-def subdivide_batch(batch, accumulate_gradient):
+def subdivide_batch(
+    batch: Union[Iterable[Doc], Iterable[Example]], accumulate_gradient: int
+):
     batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
+    if len(batch):
+        if isinstance(batch[0], Example):
+            batch.sort(key=lambda eg: len(eg.predicted))
+        else:
+            batch.sort(key=lambda doc: len(doc))
     sub_len = len(batch) // accumulate_gradient
     start = 0
     for i in range(accumulate_gradient):
@@ -320,6 +559,22 @@ def evaluate() -> Tuple[float, Dict[str, float]]:
     return evaluate
 
 
+def create_distill_batches(
+    nlp: "Language",
+    corpus: Callable[["Language"], Iterable[Example]],
+    batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
+    max_epochs: int,
+):
+    """Create distillation batches. In contrast to training, the corpus
+    is normally too large to load into memory and shuffle."""
+    epoch = 0
+    while max_epochs < 1 or epoch != max_epochs:
+        examples = corpus(nlp)
+        for batch in batcher(examples):
+            yield epoch, batch
+        epoch += 1
+
+
 def create_train_batches(
     nlp: "Language",
     corpus: Callable[["Language"], Iterable[Example]],
diff --git a/spacy/util.py b/spacy/util.py
index 7448da8ded0..3bb92e7334c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,7 @@
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+from thinc.api import fix_random_seed, set_gpu_allocator
 import functools
 import itertools
 import logging
@@ -1821,3 +1822,22 @@ def find_available_port(start: int, host: str, auto_select: bool = False) -> int
     # if we get here, the port changed
     warnings.warn(Warnings.W124.format(host=host, port=start, serve_port=port))
     return port
+
+
+def set_gpu_allocator_from_config(config: Config, use_gpu: int):
+    """Change the global GPU allocator based to the value in
+    the configuration."""
+    if "gpu_allocator" not in config["training"]:
+        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+
+
+def set_seed_from_config(config: Config):
+    """Set the random number generator seed to the value in
+    the configuration."""
+    if "seed" not in config["training"]:
+        raise ValueError(Errors.E1015.format(value="[training] seed"))
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index e38e49bf569..82cb1c14cef 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -347,19 +347,19 @@ Distill the models in a student pipeline from a teacher pipeline.
 > student.distill(teacher, examples, sgd=optimizer)
 > ```
 
-| Name                 | Description                                                                                                                                                                                 |
-| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
-| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_       |                                                                                                                                                                                             |
-| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
-| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
-| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
-| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
-| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
-| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
-| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name                 | Description                                                                                                                                                                                        |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                                 |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~        |
+| _keyword-only_       |                                                                                                                                                                                                    |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                        |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                                    |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                 |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                       |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                                     |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 

From b2d382d3349988fad0c395636a4c71a4e4336e61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:16:03 +0200
Subject: [PATCH 182/504] Remove Python 3.7 builds

---
 .github/workflows/tests.yml               | 61 +++++++++++------------
 .github/workflows/universe_validation.yml |  2 +-
 build-constraints.txt                     |  4 +-
 requirements.txt                          |  2 +-
 4 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 840b8e5f968..760a79f2121 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
           architecture: x64
 
       - name: black
@@ -60,11 +60,9 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
-          - os: windows-latest
-            python_version: "3.7"
           - os: macos-latest
             python_version: "3.8"
-          - os: ubuntu-latest
+          - os: ubuntu-20.04
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
@@ -95,7 +93,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'
 
       - name: Delete source directory and .egg-info
         run: |
@@ -117,22 +114,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      #      - name: "Test download CLI"
+      #        run: |
+      #          python -m spacy download ca_core_news_sm
+      #          python -m spacy download ca_core_news_md
+      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test download_url in info CLI"
+      #        run: |
+      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test no warnings on load (#11713)"
+      #        run: |
+      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |
@@ -156,17 +153,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      #      - name: "Test assemble CLI"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test assemble CLI vectors warning"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml
index a1e3253a9ba..c5e68784e00 100644
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
           architecture: x64
 
       - name: Validate website/meta/universe.json
diff --git a/build-constraints.txt b/build-constraints.txt
index b1cf596ca7c..781e403c59a 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -1,6 +1,4 @@
-# build version constraints for use with wheelwright
-numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
+# build version constraints for use with wheelwright + multibuild
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy>=1.25.0; python_version>='3.9'
diff --git a/requirements.txt b/requirements.txt
index a63875eda6a..760f056ebff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests

From de05e165259606c3e4abed94fe4535df0ea7d162 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:43:05 +0200
Subject: [PATCH 183/504] spancat type fixes

---
 spacy/pipeline/spancat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index bfaaf82e8d0..5c450f36a33 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -514,10 +514,9 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
         indices = activations["indices"]
         assert isinstance(indices, Ragged)
         scores = cast(Floats2d, activations["scores"])
-
         offset = 0
         for i, doc in enumerate(docs):
-            indices_i = indices[i].dataXd
+            indices_i = cast(Ints2d, indices[i].dataXd)
             if self.save_activations:
                 doc.activations[self.name] = {}
                 doc.activations[self.name]["indices"] = indices_i

From b82de947d9c55b500fbc1784f7883dbfb256976c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Jun 2023 15:38:22 +0200
Subject: [PATCH 184/504] Account for differences between Span.sents in spaCy
 3/4

---
 spacy/tokens/span.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index f51a1c5ee3e..6b7782b788b 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -520,13 +520,13 @@ cdef class Span:
                     start = i
                     if start >= self.end:
                         break
-            if start < self.end:
-                spans.append(Span(self.doc, start, self.end))
-        return tuple(spans)
+                elif i == self.doc.length - 1:
+                    spans.append(Span(self.doc, start, self.doc.length))
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
-                yield Span(self.doc, start, self.doc.length)
+                spans.append(Span(self.doc, start, self.doc.length))
+        return tuple(spans)
 
     @property
     def ents(self):

From 53a8ef8f35aad0e02b1bf7c335e1d4b9fd277aef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 23 Jun 2023 09:43:41 +0200
Subject: [PATCH 185/504] Set version to v4.0.0.dev1 (#12748)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 1ce8a44c9a4..ec1dde7cae6 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From ba3d7815096026abad65ab9ce433975a0f0592ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 11:41:03 +0200
Subject: [PATCH 186/504] isort all the things

---
 spacy/cli/__init__.py                         |  21 +-
 spacy/cli/_util.py                            |  21 +-
 spacy/cli/convert.py                          |   4 +-
 spacy/cli/debug_data.py                       |  10 +-
 spacy/cli/download.py                         |  12 +-
 spacy/cli/info.py                             |   2 +-
 spacy/cli/init_config.py                      |  13 +-
 spacy/cli/init_pipeline.py                    |  13 +-
 spacy/cli/project/assets.py                   | 218 +++++++++-
 spacy/cli/project/clone.py                    | 125 +++++-
 spacy/cli/project/document.py                 | 116 +++++-
 spacy/cli/project/dvc.py                      | 221 +++++++++-
 spacy/cli/project/pull.py                     |  68 +++-
 spacy/cli/project/push.py                     |  70 +++-
 spacy/cli/project/remote_storage.py           | 213 +++++++++-
 spacy/cli/project/run.py                      | 380 +++++++++++++++++-
 spacy/displacy/render.py                      |   1 +
 spacy/errors.py                               |   2 +-
 spacy/kb/__init__.py                          |   3 +-
 spacy/kb/candidate.pxd                        |   4 +-
 spacy/kb/candidate.pyx                        |   1 +
 spacy/kb/kb.pyx                               |   2 +-
 spacy/kb/kb_in_memory.pyx                     |   5 +-
 spacy/language.py                             |  61 +--
 spacy/lexeme.pxd                              |  17 +-
 spacy/lexeme.pyx                              |   3 +-
 spacy/matcher/dependencymatcher.pyx           |   2 +-
 spacy/matcher/matcher.pyi                     |  17 +-
 spacy/matcher/matcher.pyx                     |  26 +-
 spacy/matcher/phrasematcher.pyi               |   6 +-
 spacy/matcher/phrasematcher.pyx               |   8 +-
 spacy/ml/models/entity_linker.py              |  16 +-
 spacy/ml/models/parser.py                     |   9 +-
 spacy/ml/models/tok2vec.py                    |   2 -
 spacy/ml/staticvectors.py                     |   6 +-
 spacy/ml/tb_framework.pyx                     |  37 +-
 spacy/morphology.pxd                          |   2 +-
 spacy/morphology.pyx                          |   8 +-
 .../pipeline/_edit_tree_internals/schemas.py  |   8 +-
 .../_parser_internals/_beam_utils.pxd         |   1 +
 .../_parser_internals/_beam_utils.pyx         |  11 +-
 spacy/pipeline/_parser_internals/_state.pxd   |   2 -
 .../pipeline/_parser_internals/arc_eager.pyx  |   5 +-
 spacy/pipeline/_parser_internals/ner.pyx      |   9 +-
 spacy/pipeline/_parser_internals/search.pxd   |   6 +-
 spacy/pipeline/_parser_internals/search.pyx   |   5 +-
 .../pipeline/_parser_internals/stateclass.pyx |   3 +-
 .../_parser_internals/transition_system.pyx   |   4 +-
 spacy/pipeline/attribute_ruler.py             |   2 +-
 spacy/pipeline/dep_parser.py                  |  12 +-
 spacy/pipeline/edit_tree_lemmatizer.py        |  10 +-
 spacy/pipeline/entity_linker.py               |  30 +-
 spacy/pipeline/morphologizer.pyx              |  22 +-
 spacy/pipeline/ner.py                         |  21 +-
 spacy/pipeline/pipe.pyx                       |   4 +-
 spacy/pipeline/sentencizer.pyx                |   4 +-
 spacy/pipeline/senter.pyx                     |  12 +-
 spacy/pipeline/span_ruler.py                  |  10 +-
 spacy/pipeline/spancat.py                     |  20 +-
 spacy/pipeline/tagger.pyx                     |  22 +-
 spacy/pipeline/textcat.py                     |   6 +-
 spacy/pipeline/textcat_multilabel.py          |   6 +-
 spacy/pipeline/tok2vec.py                     |   6 +-
 spacy/pipeline/trainable_pipe.pyx             |  13 +-
 spacy/pipeline/transition_parser.pyx          |  52 ++-
 spacy/schemas.py                              |  51 +--
 spacy/strings.pxd                             |   5 +-
 spacy/strings.pyi                             |   3 +-
 spacy/strings.pyx                             |   6 +-
 spacy/tests/conftest.py                       |   8 +-
 spacy/tests/doc/test_span.py                  |   1 -
 spacy/tests/doc/test_underscore.py            |   1 +
 spacy/tests/parser/_search.pyx                |   7 +-
 spacy/tests/parser/test_ner.py                |   3 +-
 spacy/tests/parser/test_parse.py              |  12 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |   3 +-
 spacy/tests/pipeline/test_entity_linker.py    |   2 +-
 spacy/tests/pipeline/test_entity_ruler.py     |   8 +-
 spacy/tests/pipeline/test_initialize.py       |   7 +-
 spacy/tests/pipeline/test_morphologizer.py    |   3 +-
 spacy/tests/pipeline/test_pipe_factories.py   |   2 +
 spacy/tests/pipeline/test_senter.py           |   1 +
 spacy/tests/pipeline/test_spancat.py          |   7 +-
 spacy/tests/pipeline/test_tagger.py           |   3 +-
 spacy/tests/pipeline/test_textcat.py          |  18 +-
 .../tests/serialize/test_serialize_config.py  |  25 +-
 .../serialize/test_serialize_pipeline.py      |  11 +-
 spacy/tests/test_cli.py                       |  20 +-
 spacy/tests/test_cli_app.py                   |   2 +-
 spacy/tests/test_language.py                  |  16 +-
 spacy/tests/test_misc.py                      |  20 +-
 spacy/tests/test_symbols.py                   |   1 +
 spacy/tests/training/test_loop.py             |   4 +-
 spacy/tests/training/test_training.py         |  15 +-
 spacy/tokenizer.pxd                           |   5 -
 spacy/tokenizer.pyx                           |   8 +-
 spacy/tokens/__init__.py                      |   4 +-
 spacy/tokens/doc.pyi                          |  12 +-
 spacy/tokens/doc.pyx                          |  23 +-
 spacy/tokens/doc_bin.py                       |   4 +-
 spacy/tokens/graph.pyx                        |   6 +-
 spacy/tokens/morphanalysis.pxd                |   7 +-
 spacy/tokens/morphanalysis.pyx                |  10 +-
 spacy/tokens/retokenizer.pyx                  |  10 +-
 spacy/tokens/span.pxd                         |   2 +-
 spacy/tokens/span.pyx                         |  13 +-
 spacy/tokens/span_group.pyx                   |   7 +-
 spacy/tokens/token.pyx                        |   4 +-
 spacy/training/__init__.py                    |  29 +-
 spacy/training/align.pyx                      |   1 -
 spacy/training/batchers.py                    |  13 +
 spacy/training/callbacks.py                   |   6 +-
 spacy/training/converters/json_to_docs.py     |  12 +-
 spacy/training/example.pyx                    |   1 +
 spacy/training/gold_io.pyx                    |   4 +-
 spacy/training/initialize.py                  |  29 +-
 spacy/training/loop.py                        |  34 +-
 spacy/ty.py                                   |  16 +-
 spacy/util.py                                 |  27 +-
 spacy/vectors.pyx                             |  12 +-
 spacy/vocab.pyx                               |   3 +
 121 files changed, 2016 insertions(+), 602 deletions(-)

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 3095778fe22..b2612f57720 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,7 +16,6 @@
 from .debug_model import debug_model  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
-from .find_function import find_function  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
 from .info import info  # noqa: F401
 from .init_config import fill_config, init_config  # noqa: F401
@@ -24,17 +23,15 @@
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
-from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
-from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
-    project_document,
-)
-from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
-from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
-from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
-from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
-from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
-from .validate import validate  # type: ignore[attr-defined]  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 52a70cc7320..b005accf91f 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,10 +1,3 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
 import os
 import shutil
@@ -18,6 +11,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -32,15 +26,10 @@
 from thinc.util import gpu_is_available
 from typer.main import get_command
 from wasabi import Printer, msg
-from weasel import app as project_cli
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
-from ..compat import Literal
-from ..schemas import validate
+from ..errors import RENAMED_LANGUAGE_CODES
+from ..schemas import ProjectConfigSchema, validate
 from ..util import (
     ENV_VARS,
     SimpleFrozenDict,
@@ -52,6 +41,10 @@
     run_command,
 )
 
+if TYPE_CHECKING:
+    from pathy import FluidPath  # noqa: F401
+
+
 SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"
 
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3844b340678..a282e59c749 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,8 +8,6 @@
 import srsly
 from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
@@ -18,7 +16,7 @@
     iob_to_docs,
     json_to_docs,
 )
-from ._util import Arg, Opt, app, walk_directory
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c2253b0cb70..4c44a8c0e2b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,11 +1,3 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
 import sys
 from collections import Counter
@@ -15,6 +7,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,
@@ -30,7 +23,6 @@
 from wasabi import MESSAGES, Printer, msg
 
 from .. import util
-from ..compat import Literal
 from ..language import Language
 from ..morphology import Morphology
 from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea7a06c5417..0635522930b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,9 +7,15 @@
 from wasabi import msg
 
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 8bfc6b54f15..7a891547e0a 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,3 +1,4 @@
+import importlib.metadata
 import json
 import platform
 from pathlib import Path
@@ -7,7 +8,6 @@
 from wasabi import MarkdownRenderer, Printer
 
 from .. import about, util
-from ..compat import importlib_metadata
 from ._util import Arg, Opt, app, string_to_list
 from .download import get_latest_version, get_model_filename
 
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b29a2b748f2..ca0c316ca20 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,9 +12,16 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
-
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 1a044dedbc9..991dc1a822c 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,8 +8,17 @@
 
 from .. import util
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 591d1959e73..aa270598621 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -1 +1,217 @@
-from weasel.cli.assets import *
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import requests
+import typer
+from wasabi import msg
+
+from ...util import ensure_path, working_dir
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
+
+# Whether assets are extra if `extra` is not set.
+EXTRA_DEFAULT = False
+
+
+@project_cli.command(
+    "assets",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def project_assets_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
+    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
+    # fmt: on
+):
+    """Fetch project assets like datasets and pretrained weights. Assets are
+    defined in the "assets" section of the project.yml. If a checksum is
+    provided in the project.yml, the file is only downloaded if no local file
+    with the same checksum exists.
+
+    DOCS: https://spacy.io/api/cli#project-assets
+    """
+    overrides = parse_config_overrides(ctx.args)
+    project_assets(
+        project_dir,
+        overrides=overrides,
+        sparse_checkout=sparse_checkout,
+        extra=extra,
+    )
+
+
+def project_assets(
+    project_dir: Path,
+    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    sparse_checkout: bool = False,
+    extra: bool = False,
+) -> None:
+    """Fetch assets for a project using DVC if possible.
+
+    project_dir (Path): Path to project directory.
+    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
+                            needed.
+    extra (bool): Whether to download all assets, including those marked as 'extra'.
+    """
+    project_path = ensure_path(project_dir)
+    config = load_project_config(project_path, overrides=overrides)
+    assets = [
+        asset
+        for asset in config.get("assets", [])
+        if extra or not asset.get("extra", EXTRA_DEFAULT)
+    ]
+    if not assets:
+        msg.warn(
+            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
+            exits=0,
+        )
+    msg.info(f"Fetching {len(assets)} asset(s)")
+
+    for asset in assets:
+        dest = (project_dir / asset["dest"]).resolve()
+        checksum = asset.get("checksum")
+        if "git" in asset:
+            git_err = (
+                f"Cloning spaCy project templates requires Git and the 'git' command. "
+                f"Make sure it's installed and that the executable is available."
+            )
+            get_git_version(error=git_err)
+            if dest.exists():
+                # If there's already a file, check for checksum
+                if checksum and checksum == get_checksum(dest):
+                    msg.good(
+                        f"Skipping download with matching checksum: {asset['dest']}"
+                    )
+                    continue
+                else:
+                    if dest.is_dir():
+                        shutil.rmtree(dest)
+                    else:
+                        dest.unlink()
+            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
+                msg.fail(
+                    "A git asset must include 'repo', the repository address.", exits=1
+                )
+            if "path" not in asset["git"] or asset["git"]["path"] is None:
+                msg.fail(
+                    "A git asset must include 'path' - use \"\" to get the entire repository.",
+                    exits=1,
+                )
+            git_checkout(
+                asset["git"]["repo"],
+                asset["git"]["path"],
+                dest,
+                branch=asset["git"].get("branch"),
+                sparse=sparse_checkout,
+            )
+            msg.good(f"Downloaded asset {dest}")
+        else:
+            url = asset.get("url")
+            if not url:
+                # project.yml defines asset without URL that the user has to place
+                check_private_asset(dest, checksum)
+                continue
+            fetch_asset(project_path, url, dest, checksum)
+
+
+def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
+    """Check and validate assets without a URL (private assets that the user
+    has to provide themselves) and give feedback about the checksum.
+
+    dest (Path): Destination path of the asset.
+    checksum (Optional[str]): Optional checksum of the expected file.
+    """
+    if not Path(dest).exists():
+        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
+        msg.warn(err)
+    else:
+        if not checksum:
+            msg.good(f"Asset already exists: {dest}")
+        elif checksum == get_checksum(dest):
+            msg.good(f"Asset exists with matching checksum: {dest}")
+        else:
+            msg.fail(f"Asset available but with incorrect checksum: {dest}")
+
+
+def fetch_asset(
+    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
+) -> None:
+    """Fetch an asset from a given URL or path. If a checksum is provided and a
+    local file exists, it's only re-downloaded if the checksum doesn't match.
+
+    project_path (Path): Path to project directory.
+    url (str): URL or path to asset.
+    checksum (Optional[str]): Optional expected checksum of local file.
+    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
+        the asset failed.
+    """
+    dest_path = (project_path / dest).resolve()
+    if dest_path.exists():
+        # If there's already a file, check for checksum
+        if checksum:
+            if checksum == get_checksum(dest_path):
+                msg.good(f"Skipping download with matching checksum: {dest}")
+                return
+        else:
+            # If there's not a checksum, make sure the file is a possibly valid size
+            if os.path.getsize(dest_path) == 0:
+                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
+                os.remove(dest_path)
+    # We might as well support the user here and create parent directories in
+    # case the asset dir isn't listed as a dir to create in the project.yml
+    if not dest_path.parent.exists():
+        dest_path.parent.mkdir(parents=True)
+    with working_dir(project_path):
+        url = convert_asset_url(url)
+        try:
+            download_file(url, dest_path)
+            msg.good(f"Downloaded asset {dest}")
+        except requests.exceptions.RequestException as e:
+            if Path(url).exists() and Path(url).is_file():
+                # If it's a local file, copy to destination
+                shutil.copy(url, str(dest_path))
+                msg.good(f"Copied local asset {dest}")
+            else:
+                msg.fail(f"Download failed: {dest}", e)
+    if checksum and checksum != get_checksum(dest_path):
+        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
+
+
+def convert_asset_url(url: str) -> str:
+    """Check and convert the asset URL if needed.
+
+    url (str): The asset URL.
+    RETURNS (str): The converted URL.
+    """
+    # If the asset URL is a regular GitHub URL it's likely a mistake
+    if (
+        re.match(r"(http(s?)):\/\/github.com", url)
+        and "releases/download" not in url
+        and "/raw/" not in url
+    ):
+        converted = url.replace("github.com", "raw.githubusercontent.com")
+        converted = re.sub(r"/(tree|blob)/", "/", converted)
+        msg.warn(
+            "Downloading from a regular GitHub URL. This will only download "
+            "the source of the page, not the actual file. Converting the URL "
+            "to a raw URL.",
+            converted,
+        )
+        return converted
+    return url
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 11d2511a361..2ee27c92adb 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1 +1,124 @@
-from weasel.cli.clone import *
+import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
+
+from ... import about
+from ...util import ensure_path
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)
+
+DEFAULT_REPO = about.__projects__
+DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
+DEFAULT_BRANCHES = ["main", "master"]
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
+    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo).
+
+    DOCS: https://spacy.io/api/cli#project-clone
+    """
+    if dest is None:
+        dest = Path.cwd() / Path(name).parts[-1]
+    if repo == DEFAULT_REPO and branch is None:
+        branch = DEFAULT_PROJECTS_BRANCH
+
+    if branch is None:
+        for default_branch in DEFAULT_BRANCHES:
+            if git_repo_branch_exists(repo, default_branch):
+                branch = default_branch
+                break
+        if branch is None:
+            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
+            msg.fail(
+                "No branch provided and attempted default "
+                f"branches {default_branches_msg} do not exist.",
+                exits=1,
+            )
+    else:
+        if not git_repo_branch_exists(repo, branch):
+            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
+    assert isinstance(branch, str)
+    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
+
+
+def project_clone(
+    name: str,
+    dest: Path,
+    *,
+    repo: str = about.__projects__,
+    branch: str = about.__projects_branch__,
+    sparse_checkout: bool = False,
+) -> None:
+    """Clone a project template from a repository.
+
+    name (str): Name of subdirectory to clone.
+    dest (Path): Destination path of cloned project.
+    repo (str): URL of Git repo containing project templates.
+    branch (str): The branch to clone from
+    """
+    dest = ensure_path(dest)
+    check_clone(name, dest, repo)
+    project_dir = dest.resolve()
+    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
+    try:
+        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
+    except subprocess.CalledProcessError:
+        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
+        msg.fail(err, exits=1)
+    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
+    if not (project_dir / PROJECT_FILE).exists():
+        msg.warn(f"No {PROJECT_FILE} found in directory")
+    else:
+        msg.good(f"Your project is now ready!")
+        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    git_err = (
+        f"Cloning spaCy project templates requires Git and the 'git' command. "
+        f"To clone a project without Git, copy the files from the '{name}' "
+        f"directory in the {repo} to {dest} manually."
+    )
+    get_git_version(error=git_err)
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
+            f"Create the necessary folder(s) first before continuing.",
+            exits=1,
+        )
diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
index 1952524a933..80107d27acf 100644
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@@ -1 +1,115 @@
-from weasel.cli.document import *
+from pathlib import Path
+
+from wasabi import MarkdownRenderer, msg
+
+from ...util import working_dir
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
+
+DOCS_URL = "https://spacy.io"
+INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
+project, as well as the available commands and workflows. For details, see the
+[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
+INTRO_COMMANDS = f"""The following commands are defined by the project. They
+can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
+Commands are only re-run if their inputs have changed."""
+INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
+can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
+and will run the specified commands in order. Commands are only re-run if their
+inputs have changed."""
+INTRO_ASSETS = f"""The following assets are defined by the project. They can
+be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
+in the project directory."""
+# These markers are added to the Markdown and can be used to update the file in
+# place if it already exists. Only the auto-generated part will be replaced.
+MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
+MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
+# If this marker is used in an existing README, it's ignored and not replaced
+MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
+
+
+@project_cli.command("document")
+def project_document_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
+    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
+    # fmt: on
+):
+    """
+    Auto-generate a README.md for a project. If the content is saved to a file,
+    hidden markers are added so you can add custom content before or after the
+    auto-generated section and only the auto-generated docs will be replaced
+    when you re-run the command.
+
+    DOCS: https://spacy.io/api/cli#project-document
+    """
+    project_document(project_dir, output_file, no_emoji=no_emoji)
+
+
+def project_document(
+    project_dir: Path, output_file: Path, *, no_emoji: bool = False
+) -> None:
+    is_stdout = str(output_file) == "-"
+    config = load_project_config(project_dir)
+    md = MarkdownRenderer(no_emoji=no_emoji)
+    md.add(MARKER_START)
+    title = config.get("title")
+    description = config.get("description")
+    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
+    if description:
+        md.add(description)
+    md.add(md.title(2, PROJECT_FILE, "📋"))
+    md.add(INTRO_PROJECT)
+    # Commands
+    cmds = config.get("commands", [])
+    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
+    if data:
+        md.add(md.title(3, "Commands", "⏯"))
+        md.add(INTRO_COMMANDS)
+        md.add(md.table(data, ["Command", "Description"]))
+    # Workflows
+    wfs = config.get("workflows", {}).items()
+    data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
+    if data:
+        md.add(md.title(3, "Workflows", "⏭"))
+        md.add(INTRO_WORKFLOWS)
+        md.add(md.table(data, ["Workflow", "Steps"]))
+    # Assets
+    assets = config.get("assets", [])
+    data = []
+    for a in assets:
+        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
+        dest_path = a["dest"]
+        dest = md.code(dest_path)
+        if source == "Local":
+            # Only link assets if they're in the repo
+            with working_dir(project_dir) as p:
+                if (p / dest_path).exists():
+                    dest = md.link(dest, dest_path)
+        data.append((dest, source, a.get("description", "")))
+    if data:
+        md.add(md.title(3, "Assets", "🗂"))
+        md.add(INTRO_ASSETS)
+        md.add(md.table(data, ["File", "Source", "Description"]))
+    md.add(MARKER_END)
+    # Output result
+    if is_stdout:
+        print(md.text)
+    else:
+        content = md.text
+        if output_file.exists():
+            with output_file.open("r", encoding="utf8") as f:
+                existing = f.read()
+            if MARKER_IGNORE in existing:
+                msg.warn("Found ignore marker in existing file: skipping", output_file)
+                return
+            if MARKER_START in existing and MARKER_END in existing:
+                msg.info("Found existing file: only replacing auto-generated docs")
+                before = existing.split(MARKER_START)[0]
+                after = existing.split(MARKER_END)[1]
+                content = f"{before}{content}{after}"
+            else:
+                msg.warn("Replacing existing file")
+        with output_file.open("w", encoding="utf8") as f:
+            f.write(content)
+        msg.good("Saved project documentation", output_file)
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index aa1ae7dd9ed..9ad55c43302 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -1 +1,220 @@
-from weasel.cli.dvc import *
+"""This module contains helpers and subcommands for integrating spaCy projects
+with Data Version Controk (DVC). https://dvc.org"""
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+from wasabi import msg
+
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)
+
+DVC_CONFIG = "dvc.yaml"
+DVC_DIR = ".dvc"
+UPDATE_COMMAND = "dvc"
+DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
+# edited your {PROJECT_FILE}, you can regenerate this file by running:
+# {COMMAND} project {UPDATE_COMMAND}"""
+
+
+@project_cli.command(UPDATE_COMMAND)
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Auto-generate Data Version Control (DVC) config. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. If no workflow is specified, the first defined
+    workflow is used. The DVC config will only be updated if the project.yml
+    changed.
+
+    DOCS: https://spacy.io/api/cli#project-dvc
+    """
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
+
+
+def project_update_dvc(
+    project_dir: Path,
+    workflow: Optional[str] = None,
+    *,
+    verbose: bool = False,
+    quiet: bool = False,
+    force: bool = False,
+) -> None:
+    """Update the auto-generated Data Version Control (DVC) config file. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. Will only update the file if the checksum changed.
+
+    project_dir (Path): The project directory.
+    workflow (Optional[str]): Optional name of workflow defined in project.yml.
+        If not set, the first workflow will be used.
+    verbose (bool): Print more info.
+    quiet (bool): Print less info.
+    force (bool): Force update DVC config.
+    """
+    config = load_project_config(project_dir)
+    updated = update_dvc_config(
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
+    )
+    help_msg = "To execute the workflow with DVC, run: dvc repro"
+    if updated:
+        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
+    else:
+        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
+
+
+def update_dvc_config(
+    path: Path,
+    config: Dict[str, Any],
+    workflow: Optional[str] = None,
+    verbose: bool = False,
+    quiet: bool = False,
+    force: bool = False,
+) -> bool:
+    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
+    project directory. The file is auto-generated based on the config. The
+    first line of the auto-generated file specifies the hash of the config
+    dict, so if any of the config values change, the DVC config is regenerated.
+
+    path (Path): The path to the project directory.
+    config (Dict[str, Any]): The loaded project.yml.
+    verbose (bool): Whether to print additional info (via DVC).
+    quiet (bool): Don't output anything (via DVC).
+    force (bool): Force update, even if hashes match.
+    RETURNS (bool): Whether the DVC config file was updated.
+    """
+    ensure_dvc(path)
+    workflows = config.get("workflows", {})
+    workflow_names = list(workflows.keys())
+    check_workflows(workflow_names, workflow)
+    if not workflow:
+        workflow = workflow_names[0]
+    config_hash = get_hash(config)
+    path = path.resolve()
+    dvc_config_path = path / DVC_CONFIG
+    if dvc_config_path.exists():
+        # Check if the file was generated using the current config, if not, redo
+        with dvc_config_path.open("r", encoding="utf8") as f:
+            ref_hash = f.readline().strip().replace("# ", "")
+        if ref_hash == config_hash and not force:
+            return False  # Nothing has changed in project.yml, don't need to update
+        dvc_config_path.unlink()
+    dvc_commands = []
+    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
+    for name in workflows[workflow]:
+        command = config_commands[name]
+        deps = command.get("deps", [])
+        outputs = command.get("outputs", [])
+        outputs_no_cache = command.get("outputs_no_cache", [])
+        if not deps and not outputs and not outputs_no_cache:
+            continue
+        # Default to the working dir as the project path since dvc.yaml is auto-generated
+        # and we don't want arbitrary paths in there
+        project_cmd = ["python", "-m", NAME, "project", "run", name]
+        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
+        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
+        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
+
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
+        if command.get("no_skip"):
+            dvc_cmd.append("--always-changed")
+        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
+        dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
+    with working_dir(path):
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
+    with dvc_config_path.open("r+", encoding="utf8") as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
+    return True
+
+
+def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
+    """Validate workflows provided in project.yml and check that a given
+    workflow can be used to generate a DVC config.
+
+    workflows (List[str]): Names of the available workflows.
+    workflow (Optional[str]): The name of the workflow to convert.
+    """
+    if not workflows:
+        msg.fail(
+            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
+            f"define at least one list of commands.",
+            exits=1,
+        )
+    if workflow is not None and workflow not in workflows:
+        msg.fail(
+            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
+            f"Available workflows: {', '.join(workflows)}",
+            exits=1,
+        )
+    if not workflow:
+        msg.warn(
+            f"No workflow specified for DVC pipeline. Using the first workflow "
+            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
+        )
+
+
+def ensure_dvc(project_dir: Path) -> None:
+    """Ensure that the "dvc" command is available and that the current project
+    directory is an initialized DVC project.
+    """
+    try:
+        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            "To use spaCy projects with DVC (Data Version Control), DVC needs "
+            "to be installed and the 'dvc' command needs to be available",
+            "You can install the Python package from pip (pip install dvc) or "
+            "conda (conda install -c conda-forge dvc). For more details, see the "
+            "documentation: https://dvc.org/doc/install",
+            exits=1,
+        )
+    if not (project_dir / ".dvc").exists():
+        msg.fail(
+            "Project not initialized as a DVC project",
+            "To initialize a DVC project, you can run 'dvc init' in the project "
+            "directory. For more details, see the documentation: "
+            "https://dvc.org/doc/command-reference/init",
+            exits=1,
+        )
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 5e603273d94..e9be74df7f4 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -1 +1,67 @@
-from weasel.cli.pull import *
+from pathlib import Path
+
+from wasabi import msg
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
+from .run import update_lockfile
+
+
+@project_cli.command("pull")
+def project_pull_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Retrieve available precomputed outputs from a remote storage.
+    You can alias remotes in your project.yml by mapping them to storage paths.
+    A storage can be anything that the smart-open library can upload to, e.g.
+    AWS, Google Cloud Storage, SSH, local directories etc.
+
+    DOCS: https://spacy.io/api/cli#project-pull
+    """
+    for url, output_path in project_pull(project_dir, remote):
+        if url is not None:
+            msg.good(f"Pulled {output_path} from {url}")
+
+
+def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    # TODO: We don't have tests for this :(. It would take a bit of mockery to
+    # set up. I guess see if it breaks first?
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    commands = list(config.get("commands", []))
+    # We use a while loop here because we don't know how the commands
+    # will be ordered. A command might need dependencies from one that's later
+    # in the list.
+    while commands:
+        for i, cmd in enumerate(list(commands)):
+            logger.debug("CMD: %s.", cmd["name"])
+            deps = [project_dir / dep for dep in cmd.get("deps", [])]
+            if all(dep.exists() for dep in deps):
+                cmd_hash = get_command_hash("", "", deps, cmd["script"])
+                for output_path in cmd.get("outputs", []):
+                    url = storage.pull(output_path, command_hash=cmd_hash)
+                    logger.debug(
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
+                    )
+                    yield url, output_path
+
+                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
+                if all(loc.exists() for loc in out_locs):
+                    update_lockfile(project_dir, cmd)
+                # We remove the command from the list here, and break, so that
+                # we iterate over the loop again.
+                commands.pop(i)
+                break
+            else:
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
+        else:
+            # If we didn't break the for loop, break the while loop.
+            break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index 3a8e8869db1..a7915e54741 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -1 +1,69 @@
-from weasel.cli.push import *
+from pathlib import Path
+
+from wasabi import msg
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
+
+
+@project_cli.command("push")
+def project_push_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Persist outputs to a remote storage. You can alias remotes in your
+    project.yml by mapping them to storage paths. A storage can be anything that
+    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
+    local directories etc.
+
+    DOCS: https://spacy.io/api/cli#project-push
+    """
+    for output_path, url in project_push(project_dir, remote):
+        if url is None:
+            msg.info(f"Skipping {output_path}")
+        else:
+            msg.good(f"Pushed {output_path} to {url}")
+
+
+def project_push(project_dir: Path, remote: str):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. gcs, aws, ssh, local directories etc
+    """
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        logger.debug("CMD: %s", cmd["name"])
+        deps = [project_dir / dep for dep in cmd.get("deps", [])]
+        if any(not dep.exists() for dep in deps):
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
+            continue
+        cmd_hash = get_command_hash(
+            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
+        )
+        logger.debug("CMD_HASH: %s", cmd_hash)
+        for output_path in cmd.get("outputs", []):
+            output_loc = project_dir / output_path
+            if output_loc.exists() and _is_not_empty_dir(output_loc):
+                url = storage.push(
+                    output_path,
+                    command_hash=cmd_hash,
+                    content_hash=get_content_hash(output_loc),
+                )
+                logger.debug(
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
+                )
+                yield output_path, url
+
+
+def _is_not_empty_dir(loc: Path):
+    if not loc.is_dir():
+        return True
+    elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
+        return True
+    else:
+        return False
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 29409150fad..84235a90d39 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -1 +1,212 @@
-from weasel.cli.remote_storage import *
+import hashlib
+import os
+import site
+import tarfile
+import urllib.parse
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from wasabi import msg
+
+from ... import about
+from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)
+
+if TYPE_CHECKING:
+    from pathy import FluidPath  # noqa: F401
+
+
+class RemoteStorage:
+    """Push and pull outputs to and from a remote file storage.
+
+    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
+    ssh, etc.
+    """
+
+    def __init__(self, project_root: Path, url: str, *, compression="gz"):
+        self.root = project_root
+        self.url = ensure_pathy(url)
+        self.compression = compression
+
+    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
+        """Compress a file or directory within a project and upload it to a remote
+        storage. If an object exists at the full URL, nothing is done.
+
+        Within the remote storage, files are addressed by their project path
+        (url encoded) and two user-supplied hashes, representing their creation
+        context and their file contents. If the URL already exists, the data is
+        not uploaded. Paths are archived and compressed prior to upload.
+        """
+        loc = self.root / path
+        if not loc.exists():
+            raise IOError(f"Cannot push {loc}: does not exist.")
+        url = self.make_url(path, command_hash, content_hash)
+        if url.exists():
+            return url
+        tmp: Path
+        with make_tempdir() as tmp:
+            tar_loc = tmp / self.encode_name(str(path))
+            mode_string = f"w:{self.compression}" if self.compression else "w"
+            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                tar_file.add(str(loc), arcname=str(path))
+            upload_file(tar_loc, url)
+        return url
+
+    def pull(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["FluidPath"]:
+        """Retrieve a file from the remote cache. If the file already exists,
+        nothing is done.
+
+        If the command_hash and/or content_hash are specified, only matching
+        results are returned. If no results are available, an error is raised.
+        """
+        dest = self.root / path
+        if dest.exists():
+            return None
+        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
+        if url is None:
+            return url
+        else:
+            # Make sure the destination exists
+            if not dest.parent.exists():
+                dest.parent.mkdir(parents=True)
+            tmp: Path
+            with make_tempdir() as tmp:
+                tar_loc = tmp / url.parts[-1]
+                download_file(url, tar_loc)
+                mode_string = f"r:{self.compression}" if self.compression else "r"
+                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                    # This requires that the path is added correctly, relative
+                    # to root. This is how we set things up in push()
+
+                    # Disallow paths outside the current directory for the tar
+                    # file (CVE-2007-4559, directory traversal vulnerability)
+                    def is_within_directory(directory, target):
+                        abs_directory = os.path.abspath(directory)
+                        abs_target = os.path.abspath(target)
+                        prefix = os.path.commonprefix([abs_directory, abs_target])
+                        return prefix == abs_directory
+
+                    def safe_extract(tar, path):
+                        for member in tar.getmembers():
+                            member_path = os.path.join(path, member.name)
+                            if not is_within_directory(path, member_path):
+                                raise ValueError(Errors.E852)
+                        tar.extractall(path)
+
+                    safe_extract(tar_file, self.root)
+        return url
+
+    def find(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["FluidPath"]:
+        """Find the best matching version of a file within the storage,
+        or `None` if no match can be found. If both the creation and content hash
+        are specified, only exact matches will be returned. Otherwise, the most
+        recent matching file is preferred.
+        """
+        name = self.encode_name(str(path))
+        urls = []
+        if command_hash is not None and content_hash is not None:
+            url = self.url / name / command_hash / content_hash
+            urls = [url] if url.exists() else []
+        elif command_hash is not None:
+            if (self.url / name / command_hash).exists():
+                urls = list((self.url / name / command_hash).iterdir())
+        else:
+            if (self.url / name).exists():
+                for sub_dir in (self.url / name).iterdir():
+                    urls.extend(sub_dir.iterdir())
+                if content_hash is not None:
+                    urls = [url for url in urls if url.parts[-1] == content_hash]
+        if len(urls) >= 2:
+            try:
+                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
+            except Exception:
+                msg.warn(
+                    "Unable to sort remote files by last modified. The file(s) "
+                    "pulled from the cache may not be the most recent."
+                )
+        return urls[-1] if urls else None
+
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
+        """Construct a URL from a subpath, a creation hash and a content hash."""
+        return self.url / self.encode_name(str(path)) / command_hash / content_hash
+
+    def encode_name(self, name: str) -> str:
+        """Encode a subpath into a URL-safe name."""
+        return urllib.parse.quote_plus(name)
+
+
+def get_content_hash(loc: Path) -> str:
+    return get_checksum(loc)
+
+
+def get_command_hash(
+    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
+) -> str:
+    """Create a hash representing the execution of a command. This includes the
+    currently installed packages, whatever environment variables have been marked
+    as relevant, and the command.
+    """
+    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
+        spacy_v = GIT_VERSION
+    else:
+        spacy_v = str(get_minor_version(about.__version__) or "")
+    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
+    hashes.extend(cmd)
+    creation_bytes = "".join(hashes).encode("utf8")
+    return hashlib.md5(creation_bytes).hexdigest()
+
+
+def get_site_hash():
+    """Hash the current Python environment's site-packages contents, including
+    the name and version of the libraries. The list we're hashing is what
+    `pip freeze` would output.
+    """
+    site_dirs = site.getsitepackages()
+    if site.ENABLE_USER_SITE:
+        site_dirs.extend(site.getusersitepackages())
+    packages = set()
+    for site_dir in site_dirs:
+        site_dir = Path(site_dir)
+        for subpath in site_dir.iterdir():
+            if subpath.parts[-1].endswith("dist-info"):
+                packages.add(subpath.parts[-1].replace(".dist-info", ""))
+    package_bytes = "".join(sorted(packages)).encode("utf8")
+    return hashlib.md5sum(package_bytes).hexdigest()
+
+
+def get_env_hash(env: Dict[str, str]) -> str:
+    """Construct a hash of the environment variables that will be passed into
+    the commands.
+
+    Values in the env dict may be references to the current os.environ, using
+    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
+    """
+    env_vars = {}
+    for key, value in env.items():
+        if value.startswith("$"):
+            env_vars[key] = os.environ.get(value[1:], "")
+        else:
+            env_vars[key] = value
+    return get_hash(env_vars)
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index cc6a5ac4256..43972a2026a 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -1 +1,379 @@
-from weasel.cli.run import *
+import os.path
+import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
+import srsly
+import typer
+from wasabi import msg
+from wasabi.util import locale_escape
+
+from ... import about
+from ...git_info import GIT_VERSION
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
+
+
+@project_cli.command(
+    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
+)
+def project_run_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named command or workflow defined in the project.yml. If a workflow
+    name is specified, all commands in the workflow are run, in order. If
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
+
+    DOCS: https://spacy.io/api/cli#project-run
+    """
+    if show_help or not subcommand:
+        print_run_help(project_dir, subcommand)
+    else:
+        overrides = parse_config_overrides(ctx.args)
+        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
+
+
+def project_run(
+    project_dir: Path,
+    subcommand: str,
+    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    force: bool = False,
+    dry: bool = False,
+    capture: bool = False,
+    skip_requirements_check: bool = False,
+) -> None:
+    """Run a named script defined in the project.yml. If the script is part
+    of the default pipeline (defined in the "run" section), DVC is used to
+    execute the command, so it can determine whether to rerun it. It then
+    calls into "exec" to execute it.
+
+    project_dir (Path): Path to project directory.
+    subcommand (str): Name of command to run.
+    overrides (Dict[str, Any]): Optional config overrides.
+    force (bool): Force re-running, even if nothing changed.
+    dry (bool): Perform a dry run and don't execute commands.
+    capture (bool): Whether to capture the output and errors of individual commands.
+        If False, the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the return code. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
+    skip_requirements_check (bool): Whether to skip the requirements check.
+    """
+    config = load_project_config(project_dir, overrides=overrides)
+    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    workflows = config.get("workflows", {})
+    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+
+    req_path = project_dir / "requirements.txt"
+    if not skip_requirements_check:
+        if config.get("check_requirements", True) and os.path.exists(req_path):
+            with req_path.open() as requirements_file:
+                _check_requirements([req.strip() for req in requirements_file])
+
+    if subcommand in workflows:
+        msg.info(f"Running workflow '{subcommand}'")
+        for cmd in workflows[subcommand]:
+            project_run(
+                project_dir,
+                cmd,
+                overrides=overrides,
+                force=force,
+                dry=dry,
+                capture=capture,
+                skip_requirements_check=True,
+            )
+    else:
+        cmd = commands[subcommand]
+        for dep in cmd.get("deps", []):
+            if not (project_dir / dep).exists():
+                err = f"Missing dependency specified by command '{subcommand}': {dep}"
+                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
+                err_exits = 1 if not dry else None
+                msg.fail(err, err_help, exits=err_exits)
+        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+        with working_dir(project_dir) as current_dir:
+            msg.divider(subcommand)
+            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
+            if not rerun and not force:
+                msg.info(f"Skipping '{cmd['name']}': nothing changed")
+            else:
+                run_commands(cmd["script"], dry=dry, capture=capture)
+                if not dry:
+                    update_lockfile(current_dir, cmd)
+
+
+def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
+    """Simulate a CLI help prompt using the info available in the project.yml.
+
+    project_dir (Path): The project directory.
+    subcommand (Optional[str]): The subcommand or None. If a subcommand is
+        provided, the subcommand help is shown. Otherwise, the top-level help
+        and a list of available commands is printed.
+    """
+    config = load_project_config(project_dir)
+    config_commands = config.get("commands", [])
+    commands = {cmd["name"]: cmd for cmd in config_commands}
+    workflows = config.get("workflows", {})
+    project_loc = "" if is_cwd(project_dir) else project_dir
+    if subcommand:
+        validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
+        if subcommand in commands:
+            help_text = commands[subcommand].get("help")
+            if help_text:
+                print(f"\n{help_text}\n")
+        elif subcommand in workflows:
+            steps = workflows[subcommand]
+            print(f"\nWorkflow consisting of {len(steps)} commands:")
+            steps_data = [
+                (f"{i + 1}. {step}", commands[step].get("help", ""))
+                for i, step in enumerate(steps)
+            ]
+            msg.table(steps_data)
+            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
+            print(f"For command details, run: {help_cmd}")
+    else:
+        print("")
+        title = config.get("title")
+        if title:
+            print(f"{locale_escape(title)}\n")
+        if config_commands:
+            print(f"Available commands in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
+            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
+        if workflows:
+            print(f"Available workflows in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
+            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
+
+
+def run_commands(
+    commands: Iterable[str] = SimpleFrozenList(),
+    silent: bool = False,
+    dry: bool = False,
+    capture: bool = False,
+) -> None:
+    """Run a sequence of commands in a subprocess, in order.
+
+    commands (List[str]): The string commands.
+    silent (bool): Don't print the commands.
+    dry (bool): Perform a dry run and don't execut anything.
+    capture (bool): Whether to capture the output and errors of individual commands.
+        If False, the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the return code. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
+    """
+    for c in commands:
+        command = split_command(c)
+        # Not sure if this is needed or a good idea. Motivation: users may often
+        # use commands in their config that reference "python" and we want to
+        # make sure that it's always executing the same Python that spaCy is
+        # executed with and the pip in the same env, not some other Python/pip.
+        # Also ensures cross-compatibility if user 1 writes "python3" (because
+        # that's how it's set up on their system), and user 2 without the
+        # shortcut tries to re-run the command.
+        if len(command) and command[0] in ("python", "python3"):
+            command[0] = sys.executable
+        elif len(command) and command[0] in ("pip", "pip3"):
+            command = [sys.executable, "-m", "pip", *command[1:]]
+        if not silent:
+            print(f"Running command: {join_command(command)}")
+        if not dry:
+            run_command(command, capture=capture)
+
+
+def validate_subcommand(
+    commands: Sequence[str], workflows: Sequence[str], subcommand: str
+) -> None:
+    """Check that a subcommand is valid and defined. Raises an error otherwise.
+
+    commands (Sequence[str]): The available commands.
+    subcommand (str): The subcommand.
+    """
+    if not commands and not workflows:
+        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
+    if subcommand not in commands and subcommand not in workflows:
+        help_msg = []
+        if subcommand in ["assets", "asset"]:
+            help_msg.append("Did you mean to run: python -m spacy project assets?")
+        if commands:
+            help_msg.append(f"Available commands: {', '.join(commands)}")
+        if workflows:
+            help_msg.append(f"Available workflows: {', '.join(workflows)}")
+        msg.fail(
+            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
+            ". ".join(help_msg),
+            exits=1,
+        )
+
+
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    *,
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
+    """Check if a command should be rerun because its settings or inputs/outputs
+    changed.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    strict_version (bool):
+    RETURNS (bool): Whether to re-run the command.
+    """
+    # Always rerun if no-skip is set
+    if command.get("no_skip", False):
+        return True
+    lock_path = project_dir / PROJECT_LOCK
+    if not lock_path.exists():  # We don't have a lockfile, run command
+        return True
+    data = srsly.read_yaml(lock_path)
+    if command["name"] not in data:  # We don't have info about this command
+        return True
+    entry = data[command["name"]]
+    # Always run commands with no outputs (otherwise they'd always be skipped)
+    if not entry.get("outs", []):
+        return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
+    # If the entry in the lockfile matches the lockfile entry that would be
+    # generated from the current command, we don't rerun because it means that
+    # all inputs/outputs, hashes and scripts are the same and nothing changed
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
+
+
+def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
+    """Update the lockfile after running a command. Will create a lockfile if
+    it doesn't yet exist and will add an entry for the current command, its
+    script and dependencies/outputs.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    if not lock_path.exists():
+        srsly.write_yaml(lock_path, {})
+        data = {}
+    else:
+        data = srsly.read_yaml(lock_path)
+    data[command["name"]] = get_lock_entry(project_dir, command)
+    srsly.write_yaml(lock_path, data)
+
+
+def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
+    """Get a lockfile entry for a given command. An entry includes the command,
+    the script (command steps) and a list of dependencies and outputs with
+    their paths and file hashes, if available. The format is based on the
+    dvc.lock files, to keep things consistent.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    RETURNS (Dict[str, Any]): The lockfile entry.
+    """
+    deps = get_fileinfo(project_dir, command.get("deps", []))
+    outs = get_fileinfo(project_dir, command.get("outputs", []))
+    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
+    return {
+        "cmd": f"{COMMAND} run {command['name']}",
+        "script": command["script"],
+        "deps": deps,
+        "outs": [*outs, *outs_nc],
+        "spacy_version": about.__version__,
+        "spacy_git_version": GIT_VERSION,
+    }
+
+
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
+    """Generate the file information for a list of paths (dependencies, outputs).
+    Includes the file path and the file's checksum.
+
+    project_dir (Path): The current project directory.
+    paths (List[str]): The file paths.
+    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
+    """
+    data = []
+    for path in paths:
+        file_path = project_dir / path
+        md5 = get_checksum(file_path) if file_path.exists() else None
+        data.append({"path": path, "md5": md5})
+    return data
+
+
+def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
+    """Checks whether requirements are installed and free of version conflicts.
+    requirements (List[str]): List of requirements.
+    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
+        exist.
+    """
+    import pkg_resources
+
+    failed_pkgs_msgs: List[str] = []
+    conflicting_pkgs_msgs: List[str] = []
+
+    for req in requirements:
+        try:
+            pkg_resources.require(req)
+        except pkg_resources.DistributionNotFound as dnf:
+            failed_pkgs_msgs.append(dnf.report())
+        except pkg_resources.VersionConflict as vc:
+            conflicting_pkgs_msgs.append(vc.report())
+        except Exception:
+            msg.warn(
+                f"Unable to check requirement: {req} "
+                "Checks are currently limited to requirement specifiers "
+                "(PEP 508)"
+            )
+
+    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
+        msg.warn(
+            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
+            "correctly and you installed all requirements specified in your project's requirements.txt: "
+        )
+        for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
+            msg.text(pgk_msg)
+
+    return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 40b9986e85b..b7f689bcb3e 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,3 +1,4 @@
+import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union
 
diff --git a/spacy/errors.py b/spacy/errors.py
index fe067f7915d..4909371d549 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,5 +1,5 @@
-from typing import Literal
 import warnings
+from typing import Literal
 
 
 class ErrorsWithCodes(type):
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 7155c15df9a..2aa084ef52a 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,6 +1,5 @@
-from .candidate import Candidate, get_candidates, get_candidates_batch
+from .candidate import Candidate, InMemoryCandidate
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, InMemoryCandidate
 
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index f21f423e496..4419ed47666 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,6 +1,8 @@
 from libcpp.vector cimport vector
-from .kb_in_memory cimport InMemoryLookupKB
+
 from ..typedefs cimport hash_t
+from .kb_in_memory cimport InMemoryLookupKB
+
 
 cdef class Candidate:
     pass
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index bf66ccfae67..1739cfa64f6 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,6 +1,7 @@
 # cython: infer_types=True
 
 from .kb_in_memory cimport InMemoryLookupKB
+
 from ..errors import Errors
 
 
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index bb58bf88a46..c3479eabc18 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
 
-from .candidate import Candidate
+from ..errors import Errors
 from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from .candidate import Candidate
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 3aab0d73e72..fee407e68b2 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,5 +1,5 @@
-# cython: infer_types=True
-from typing import Any, Callable, Dict, Iterable
+# cython: infer_types=True, profile=True
+from typing import Any, Callable, Dict, Iterable, Union
 
 import srsly
 
@@ -22,6 +22,7 @@ from ..util import SimpleFrozenList, ensure_path
 
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
+
 from .candidate import InMemoryCandidate
 
 
diff --git a/spacy/language.py b/spacy/language.py
index 028f733200e..ea641224684 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,12 +1,4 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
 import functools
-import inspect
 import itertools
 import multiprocessing as mp
 import random
@@ -25,6 +17,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     NoReturn,
     Optional,
     Pattern,
@@ -37,29 +30,41 @@
     overload,
 )
 
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples, validate_distillation_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from . import util
-from . import about
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
 from .lookups import load_lookups
-
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+    ConfigSchema,
+    ConfigSchemaInit,
+    ConfigSchemaNlp,
+    ConfigSchemaPretrain,
+    validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_distillation_examples, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+    _DEFAULT_EMPTY_PIPES,
+    CONFIG_SECTION_ORDER,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    _pipe,
+    combine_score_weights,
+    raise_error,
+    registry,
+    warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 2d14edcd6b0..ff51d77e8a9 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,10 +1,19 @@
 from numpy cimport ndarray
 
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
+from .attrs cimport (
+    ID,
+    LANG,
+    LENGTH,
+    LOWER,
+    NORM,
+    ORTH,
+    PREFIX,
+    SHAPE,
+    SUFFIX,
+    attr_id_t,
+)
 from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 41fc8f1d2b1..92ef3b16259 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -2,6 +2,7 @@
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
+from cython.view cimport array as cvarray
 from libc.string cimport memset
 
 np.import_array()
@@ -35,7 +36,7 @@ from .typedefs cimport attr_t, flags_t
 from .attrs import intify_attrs
 from .errors import Errors, Warnings
 
-OOV_RANK = 0xffffffffffffffff  # UINT64_MAX
+OOV_RANK = 0xffffffffffffffff # UINT64_MAX
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.id = OOV_RANK
 
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 0b639ab04fb..60299603623 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
 import warnings
 from collections import defaultdict
 from itertools import product
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index a0b6d91e7d5..fe2d8bec3bc 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,6 +1,17 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
-from typing import Iterator, Iterable, overload
-from ..vocab import Vocab
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 7e734ac247e..8accd8c4465 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True
+# cython: binding=True, infer_types=True, profile=True
 from typing import Iterable, List
 
 from cymem.cymem cimport Pool
@@ -12,23 +12,35 @@ import warnings
 
 import srsly
 
-from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
+from ..attrs cimport (
+    DEP,
+    ENT_IOB,
+    ID,
+    LEMMA,
+    MORPH,
+    NULL_ATTR,
+    ORTH,
+    POS,
+    TAG,
+    attr_id_t,
+)
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
+from ..vocab cimport Vocab
 
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings cimport get_string_id
-from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
-from ..strings import get_string_id
 from .levenshtein import levenshtein_compare
 
+from ..strings cimport get_string_id
+
+from ..attrs import IDS
+from ..util import registry
+
 DEF PADDING = 5
 
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 45685db228a..d3c679a65d5 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,7 +1,5 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
-from typing import overload
-from .matcher import Matcher
-from ..vocab import Vocab
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 6e3c52924fa..107d7d926ee 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,15 +1,17 @@
 # cython: infer_types=True, profile=True
-from typing import List
 from collections import defaultdict
+from typing import List
+
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings
 
-from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
 
 from ..attrs import IDS
 
+from ..structs cimport TokenC
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index db960fbd0a9..987eb6733d3 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -14,21 +14,9 @@
 )
 from thinc.types import Floats2d
 
-from ...util import registry
-from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate
-from ...vocab import Vocab
-from ...tokens import Doc, Span, SpanGroup
-from ..extract_spans import extract_spans
 from ...errors import Errors
-from ...kb import (
-    Candidate,
-    InMemoryLookupKB,
-    KnowledgeBase,
-    get_candidates,
-    get_candidates_batch,
-)
-from ...tokens import Doc, Span
+from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
+from ...tokens import Doc, Span, SpanGroup
 from ...util import registry
 from ...vocab import Vocab
 from ..extract_spans import extract_spans
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 01312983d86..422abf4e260 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,13 @@
-from typing import Optional, List, Tuple, Any, Literal
-from thinc.types import Floats2d
-from thinc.api import Model
 import warnings
+from typing import Any, List, Literal, Optional, Tuple
+
+from thinc.api import Model
+from thinc.types import Floats2d
 
 from ...errors import Errors, Warnings
+from ...tokens.doc import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
-from ...tokens.doc import Doc
 
 TransitionSystem = Any  # TODO
 State = Any  # TODO
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index a605d32cd40..61bc7291e2e 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,8 +22,6 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 1a1b0a0fffd..3b9a9ce2dd1 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,4 +1,3 @@
-import warnings
 from typing import Callable, List, Optional, Sequence, Tuple, cast
 
 from thinc.api import Model, Ops, registry
@@ -6,10 +5,9 @@
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from thinc.util import partial
 
-from ..attrs import ORTH
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..tokens import Doc
-from ..vectors import Mode, Vectors
+from ..vectors import Mode
 from ..vocab import Vocab
 
 
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 9b2114900d3..fd0af12ceab 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,28 +1,45 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import List, Tuple, Any, Optional, TypeVar, cast
-from libc.string cimport memset, memcpy
+from typing import Any, List, Optional, Tuple, TypeVar, cast
+
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
+
 import numpy
+
 cimport numpy as np
-from thinc.api import Model, normal_init, chain, list2array, Linear
-from thinc.api import uniform_init, glorot_uniform_init, zero_init
-from thinc.api import NumpyOps
+
+from thinc.api import (
+    Linear,
+    Model,
+    NumpyOps,
+    chain,
+    glorot_uniform_init,
+    list2array,
+    normal_init,
+    uniform_init,
+    zero_init,
+)
+
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
-from thinc.types import Ints1d, Ints2d
+
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
 from ..pipeline._parser_internals.batch import GreedyBatch
+
 from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
-from ..pipeline._parser_internals.transition_system cimport TransitionSystem
 from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..pipeline._parser_internals.transition_system cimport (
+    TransitionSystem,
+    c_apply_actions,
+    c_transition_batch,
+)
+
 from ..tokens.doc import Doc
 from ..util import registry
 
-
 State = Any  # TODO
 
 
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 494088879b1..5138d353cf0 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 7ee621056f1..d75c1071941 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,14 +1,14 @@
 # cython: infer_types
-# cython: profile=False
 import warnings
-from typing import Union, Tuple, List, Dict, Optional
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy
+
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
-from .errors import Warnings
 from . import symbols
 from .errors import Warnings
-from .parts_of_speech import IDS as POS_IDS
 
 
 cdef class Morphology:
diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py
index 89f2861ceac..1e307b66cb9 100644
--- a/spacy/pipeline/_edit_tree_internals/schemas.py
+++ b/spacy/pipeline/_edit_tree_internals/schemas.py
@@ -1,12 +1,8 @@
 from collections import defaultdict
 from typing import Any, Dict, List, Union
 
-try:
-    from pydantic.v1 import BaseModel, Field, ValidationError
-    from pydantic.v1.types import StrictBool, StrictInt, StrictStr
-except ImportError:
-    from pydantic import BaseModel, Field, ValidationError  # type: ignore
-    from pydantic.types import StrictBool, StrictInt, StrictStr  # type: ignore
+from pydantic import BaseModel, Field, ValidationError
+from pydantic.types import StrictBool, StrictInt, StrictStr
 
 
 class MatchNodeSchema(BaseModel):
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 571f246b1e3..5a452e56a88 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,5 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
+
 # These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index c86de231d09..7098b822ef0 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,14 +1,21 @@
 # cython: infer_types=True
+# cython: profile=True
+cimport numpy as np
+
 import numpy
-from cpython.ref cimport PyObject, Py_XDECREF
 
-from ...typedefs cimport class_t
+from cpython.ref cimport Py_XDECREF, PyObject
+
+from ...typedefs cimport class_t, hash_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+
 from .batch cimport Batch
 from .search cimport Beam, MaxViolation
+
 from .search import MaxViolation
+
 from .stateclass cimport StateC, StateClass
 
 
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1b6b25e5f16..1c61ac271d8 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,5 +1,4 @@
 cimport libcpp
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -8,7 +7,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 673e36bf5ac..08f60b2634b 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -1,4 +1,4 @@
-# cython: cdivision=True, infer_types=True
+# cython: profile=True, cdivision=True, infer_types=True
 from cymem.cymem cimport Address, Pool
 from libc.stdint cimport int32_t
 from libcpp.vector cimport vector
@@ -9,7 +9,7 @@ from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...tokens.token cimport MISSING_DEP
-from ...typedefs cimport attr_t
+from ...typedefs cimport attr_t, hash_t
 
 from ...training import split_bilu_label
 
@@ -18,6 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
+
 from .search cimport Beam
 
 
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index cf19c834ed9..5c31ff5c21d 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,10 +1,10 @@
 import os
 import random
+
+from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 
 from collections import Counter
 
@@ -14,16 +14,15 @@ from ...tokens.span import Span
 
 from ...attrs cimport IS_SPACE
 from ...lexeme cimport Lexeme
-from ...structs cimport SpanC
+from ...structs cimport SpanC, TokenC
 from ...tokens.span cimport Span
 from ...typedefs cimport attr_t, weight_t
 
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
-from .search cimport Beam
-from .stateclass cimport StateClass
 from ._state cimport StateC
+from .search cimport Beam
 from .stateclass cimport StateClass
 from .transition_system cimport Transition, do_func_t
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index dfe30e1c130..4626496335a 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -1,12 +1,10 @@
 from cymem.cymem cimport Pool
-
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.pair cimport pair
 from libcpp.queue cimport priority_queue
 from libcpp.vector cimport vector
 
-from ...typedefs cimport class_t, weight_t, hash_t
+from ...typedefs cimport class_t, hash_t, weight_t
 
 ctypedef pair[weight_t, size_t] Entry
 ctypedef priority_queue[Entry] Queue
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 1d9b6dd7adf..251eaa805cb 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,7 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.string cimport memset, memcpy
-from libc.math cimport log, exp
+from libc.math cimport exp, log
+from libc.string cimport memcpy, memset
+
 import math
 
 from cymem.cymem cimport Pool
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index e49ff63c48b..c2a0d22956a 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True
-# cython: profile=False
+import numpy
+
 from libcpp.vector cimport vector
 
 from ...tokens.doc cimport Doc
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index d1340d68c62..a433ce7dc75 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -11,9 +11,11 @@ from collections import Counter
 import srsly
 
 from ...structs cimport TokenC
+from ...tokens.doc cimport Doc
 from ...typedefs cimport attr_t, weight_t
-from .stateclass cimport StateClass
+from . cimport _beam_utils
 from ._parser_utils cimport arg_max_if_valid
+from .stateclass cimport StateClass
 
 from ... import util
 from ...errors import Errors
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 126a48945bc..76f82b84e38 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -11,7 +11,7 @@
 from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
-from ..vocab import Vocab
+from ..training import Example
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py
index 370a698c25a..b4961487b83 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.py
@@ -1,23 +1,19 @@
 # cython: infer_types=True, binding=True
 from collections import defaultdict
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.arc_eager import ArcEager
-
-from ._parser_internals.arc_eager cimport ArcEager
-from .transition_parser cimport Parser
-
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
 from ..util import registry
 from ._parser_internals import nonproj
+from ._parser_internals.arc_eager import ArcEager
 from ._parser_internals.nonproj import DELIMITER
+from ._parser_internals.transition_system import TransitionSystem
 from .functions import merge_subtokens
+from .transition_parser import Parser
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index a1bcb98455c..046ef19c3d5 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,12 +1,12 @@
 from collections import Counter
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model
-from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -19,10 +19,6 @@
 from .lemmatizer import lemmatizer_score
 from .trainable_pipe import TrainablePipe
 
-# The cutoff value of *top_k* above which an alternative method is used to process guesses.
-TOP_K_GUARDRAIL = 20
-
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 4882ead1d92..287f96d9b97 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,45 +1,27 @@
-import warnings
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
-from numpy import dtype
-from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
-from pathlib import Path
-from itertools import islice
-import srsly
 import random
+import warnings
 from itertools import islice
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union, cast
 
 import srsly
+from numpy import dtype
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats2d
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 
-from ..kb import KnowledgeBase, Candidate
-from ..tokens import Doc, Span
-from ..ml import empty_kb
-from ..tokens import Doc, Span, SpanGroup
-from .pipe import deserialize_config
-from .trainable_pipe import TrainablePipe
-from ..language import Language
-from ..vocab import Vocab
-from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors, Warnings
-from ..util import SimpleFrozenList, registry
 from .. import util
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
 from ..ml import empty_kb
 from ..scorer import Scorer
-from ..tokens import Doc, Span
+from ..tokens import Doc, Span, SpanGroup
 from ..training import Example, validate_examples, validate_get_examples
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
-from .legacy.entity_linker import EntityLinker_v1
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
-
 ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 5e7d0720a40..7259fc02699 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,32 +1,30 @@
 # cython: infer_types=True, profile=True, binding=True
+from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
+
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
-from itertools import islice
-from typing import Callable, Dict, Optional, Union
-
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
-from ..parts_of_speech import IDS as POS_IDS
-from ..symbols import POS
-from ..language import Language
-from ..errors import Errors
-from .pipe import deserialize_config
-from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..parts_of_speech import IDS as POS_IDS
 from ..scorer import Scorer
+from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 2c5fd89cc5d..1c7cf151385 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -1,25 +1,16 @@
 # cython: infer_types=True, binding=True
 from collections import defaultdict
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.ner import BiluoPushDown
-from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
-from ..util import registry
-from ..training import remove_bilu_prefix
-
-from ._parser_internals.ner cimport BiluoPushDown
-from .transition_parser cimport Parser
-
 from ..language import Language
-from ..scorer import get_ner_prf
-from ..training import remove_bilu_prefix
+from ..scorer import PRFScore, get_ner_prf
+from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
+from ._parser_internals.ner import BiluoPushDown
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index af7cd09f171..7bc6735a802 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,6 +1,6 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
 import warnings
-from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
 
 import srsly
 
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 02b92e87812..6dd62ed8577 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
 from typing import Callable, List, Optional
 
 import srsly
@@ -7,9 +7,11 @@ from ..tokens.doc cimport Doc
 
 from .. import util
 from ..language import Language
+from ..scorer import Scorer
 from .pipe import Pipe
 from .senter import senter_score
 
+
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index ba45df28400..42615e194e0 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,25 +1,21 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Union
 
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .tagger import ActivationsT, Tagger
-from ..language import Language
+from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
-
+from .tagger import ActivationsT, Tagger
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 4875c5e4bff..cd8fea36b47 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,20 +17,12 @@
 
 import srsly
 
-from .pipe import Pipe
-from ..training import Example
-from ..language import Language
-from ..errors import Errors, Warnings
-from ..util import ensure_path, SimpleFrozenList, registry
-from ..tokens import Doc, Span
-from ..scorer import Scorer, get_ner_prf
-from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
 from ..matcher import Matcher, PhraseMatcher
 from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import Scorer
+from ..scorer import Scorer, get_ner_prf
 from ..tokens import Doc, Span
 from ..training import Example
 from ..util import SimpleFrozenList, ensure_path, registry
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 5c450f36a33..72fd78f461e 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,8 +1,18 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union, Protocol, runtime_checkable
-from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
-from thinc.api import Optimizer
-from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
+from dataclasses import dataclass
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    cast,
+    runtime_checkable,
+)
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 8740058174a..f3d0527ea0b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,29 +1,29 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Callable, Dict, Iterable, List, Optional, Union
-from typing import Tuple
-import numpy
-import srsly
-from thinc.api import Model, set_dropout_rate, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
+import srsly
+from thinc.api import Config, Model, set_dropout_rate
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
+from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
+from ..vocab cimport Vocab
 
 from .. import util
-from ..errors import Errors
+from ..attrs import ID, POS
+from ..errors import Errors, Warnings
 from ..language import Language
+from ..parts_of_speech import X
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
+from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 default_model_config = """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 6cb33109891..13841dd7bbb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,9 +1,5 @@
-from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
-from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
-from thinc.types import Floats2d
-import numpy
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
 from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 9ed9770086c..309b9a84443 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,9 +1,5 @@
-from typing import Iterable, Optional, Dict, List, Callable, Any, Union
-from thinc.types import Floats2d
-from thinc.api import Model, Config
-
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 from thinc.api import Config, Model
 from thinc.types import Floats2d
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index f168aee2ec4..92aec22b7a7 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,10 +1,8 @@
-from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
-from thinc.api import Model, set_dropout_rate, Optimizer, Config
-from thinc.types import Floats2d
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
 
 from thinc.api import Config, Model, Optimizer, set_dropout_rate
+from thinc.types import Floats2d
 
 from ..errors import Errors
 from ..language import Language
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 97442a1aa97..e7cf566a113 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,19 +1,16 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import set_dropout_rate, Model, Optimizer
-import warnings
+from thinc.api import Model, Optimizer, set_dropout_rate
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples, validate_distillation_examples
-from ..errors import Errors, Warnings
-from .pipe import Pipe, deserialize_config
 from .. import util
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..language import Language
-from ..training import Example, validate_examples
+from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
 from .pipe import Pipe, deserialize_config
 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index ef2e3314e85..d521aeced7f 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,49 +1,61 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
 from cymem.cymem cimport Pool
 
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memset
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
 
-import random
 import contextlib
+import random
+import warnings
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops, get_array_module
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
-from thinc.api import CupyOps, NumpyOps, set_dropout_rate
+from thinc.api import (
+    CupyOps,
+    NumpyOps,
+    Optimizer,
+    chain,
+    get_array_module,
+    get_ops,
+    set_dropout_rate,
+    softmax_activation,
+    use_ops,
+)
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
-from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.search cimport Beam
+
 from ..tokens.doc cimport Doc
-from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateC, StateClass
+from .trainable_pipe cimport TrainablePipe
+
 from ._parser_internals import _beam_utils
+
+from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from ..typedefs cimport weight_t
 
-from ..training import validate_examples, validate_get_examples
-from ..training import validate_distillation_examples
-from ..errors import Errors, Warnings
 from .. import util
-from ..errors import Errors
-from ..training import validate_examples, validate_get_examples
-from ._parser_internals import _beam_utils
+from ..errors import Errors, Warnings
+from ..training import (
+    validate_distillation_examples,
+    validate_examples,
+    validate_get_examples,
+)
+
 
 # TODO: Remove when we switch to Cython 3.
 cdef extern from "<algorithm>" namespace "std" nogil:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 7fc5ec20e51..4372e3f5e2e 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,12 +1,3 @@
-from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
-from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
-from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
-from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError, Model
-from thinc.config import Promise
-from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
@@ -18,6 +9,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Type,
@@ -25,34 +17,19 @@
     Union,
 )
 
-try:
-    from pydantic.v1 import (
-        BaseModel,
-        ConstrainedStr,
-        Field,
-        StrictBool,
-        StrictFloat,
-        StrictInt,
-        StrictStr,
-        ValidationError,
-        create_model,
-        validator,
-    )
-    from pydantic.v1.main import ModelMetaclass
-except ImportError:
-    from pydantic import (  # type: ignore
-        BaseModel,
-        ConstrainedStr,
-        Field,
-        StrictBool,
-        StrictFloat,
-        StrictInt,
-        StrictStr,
-        ValidationError,
-        create_model,
-        validator,
-    )
-    from pydantic.main import ModelMetaclass  # type: ignore
+from pydantic import (
+    BaseModel,
+    ConstrainedStr,
+    Field,
+    StrictBool,
+    StrictFloat,
+    StrictInt,
+    StrictStr,
+    ValidationError,
+    create_model,
+    validator,
+)
+from pydantic.main import ModelMetaclass
 from thinc.api import ConfigValidationError, Model, Optimizer
 from thinc.config import Promise
 
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index b734a707c54..c05731c9a15 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,8 +1,5 @@
-from libc.stdint cimport int64_t, uint32_t
-from libcpp.vector cimport vector
-from libcpp.set cimport set
 from cymem.cymem cimport Pool
-from libc.stdint cimport int64_t
+from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
 from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 393661f591d..98224fcd449 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,6 +1,5 @@
-from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
-from typing import Any, Iterable, Iterator, Optional, Union, overload
+from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union, overload
 
 class StringStore:
     def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 73e4c46ed46..43826f07c44 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,10 @@
 # cython: infer_types=True
-from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
+from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+
 cimport cython
 from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 import srsly
@@ -14,7 +17,6 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-
 cdef class StringStore:
     """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index fdc9f192c2f..28551f9ee63 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,11 +1,11 @@
-import pytest
-from spacy.util import get_lang_class
 import functools
-from hypothesis import settings
-import inspect
 import importlib
+import inspect
 import sys
 
+import pytest
+from hypothesis import settings
+
 from spacy.util import get_lang_class
 
 # Functionally disable deadline settings for tests
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 0b05ca7c123..cf850a2234d 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,7 +6,6 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index ca5c2ad3959..3ab7de76323 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,7 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+
 # Helper functions
 def _get_tuple(s: Span):
     return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 23fc8164412..0983159b75d 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -1,11 +1,14 @@
 # cython: infer_types=True, binding=True
+from cymem.cymem cimport Pool
+
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
 from spacy.typedefs cimport class_t, weight_t
-from cymem.cymem cimport Pool
 
-from ..conftest import cytest
 import pytest
 
+from ..conftest import cytest
+
+
 cdef struct TestState:
     int length
     int x
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 2c520b7daf6..f0efc3a63fd 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -3,6 +3,7 @@
 
 import pytest
 from numpy.testing import assert_equal
+from thinc.api import fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import ENT_IOB
@@ -16,8 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-from thinc.api import fix_random_seed
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 4c709932bb1..636bb887789 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,17 +1,19 @@
 import itertools
-import pytest
+
 import numpy
+import pytest
 from numpy.testing import assert_equal
-from thinc.api import Adam
+from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.training import Example
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
+from spacy.training import Example
 from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 0f204ead477..7465c844492 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import cast
 import pickle
+from typing import cast
 
 import hypothesis.strategies as st
 import pytest
@@ -10,7 +10,6 @@
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
 from spacy.pipeline.trainable_pipe import TrainablePipe
-from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 170f2215f83..fe7335600b4 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Dict, Any, cast
+from typing import Any, Callable, Dict, Iterable, cast
 
 import pytest
 from numpy.testing import assert_equal
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 6bff3288dc3..520012c5075 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,16 +2,10 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
-from spacy.tokens import Doc, Span
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.pipeline import EntityRecognizer, merge_entities
-from spacy.pipeline import SpanRuler
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import EntityRecognizer, EntityRuler, SpanRuler, merge_entities
+from spacy.pipeline import EntityRecognizer, SpanRuler, merge_entities
 from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc, Span
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 9854b391e60..6dd4114f1cd 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -1,10 +1,5 @@
 import pytest
-
-try:
-    from pydantic.v1 import StrictBool
-except ImportError:
-    from pydantic import StrictBool  # type: ignore
-
+from pydantic import StrictBool
 from thinc.api import ConfigValidationError
 
 from spacy.lang.en import English
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index fffb7b4ed7f..542d14d1516 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -9,7 +10,7 @@
 from spacy.language import Language
 from spacy.morphology import Morphology
 from spacy.pipeline import TrainablePipe
-from spacy.attrs import MORPH
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc
 from spacy.training import Example
 
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index c45dccb0624..9e1382ebd8c 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -1,4 +1,6 @@
 import pytest
+from pydantic import StrictInt, StrictStr
+from thinc.api import ConfigValidationError, Linear, Model
 
 try:
     from pydantic.v1 import StrictInt, StrictStr
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 94285178310..51f943898f1 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_equal
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 5dcc2e70f67..42eb90a1bb1 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,6 +1,7 @@
 import numpy
-from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged, fix_random_seed
+import pytest
+from numpy.testing import assert_almost_equal, assert_array_equal
+from thinc.api import NumpyOps, Ragged, fix_random_seed, get_current_ops
 
 from spacy import util
 from spacy.lang.en import English
@@ -8,7 +9,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import registry, make_tempdir
+from spacy.util import make_tempdir, registry
 
 OPS = get_current_ops()
 
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index b6f94f7f97b..05e814f0733 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -8,7 +9,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from thinc.api import compounding
+from spacy.training import Example
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2383c36bb01..3f2d757eebc 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,5 +1,5 @@
-from typing import cast
 import random
+from typing import cast
 
 import numpy.random
 import pytest
@@ -13,12 +13,16 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import single_label_bow_config
-from spacy.pipeline.textcat import single_label_cnn_config
-from spacy.pipeline.textcat import single_label_default_config
-from spacy.pipeline.textcat_multilabel import multi_label_bow_config
-from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
-from spacy.pipeline.textcat_multilabel import multi_label_default_config
+from spacy.pipeline.textcat import (
+    single_label_bow_config,
+    single_label_cnn_config,
+    single_label_default_config,
+)
+from spacy.pipeline.textcat_multilabel import (
+    multi_label_bow_config,
+    multi_label_cnn_config,
+    multi_label_default_config,
+)
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index eb0dcc1e38c..646ce0f5d48 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,14 +5,25 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
-from spacy.language import Language
-from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
-from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.language import (
+    DEFAULT_CONFIG,
+    DEFAULT_CONFIG_DISTILL_PATH,
+    DEFAULT_CONFIG_PRETRAIN_PATH,
+    Language,
+)
+from spacy.ml.models import (
+    MaxoutWindowEncoder,
+    MultiHashEmbed,
+    build_tb_parser_model,
+    build_Tok2Vec_model,
+)
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
-from spacy.util import load_config, load_config_from_str
-from spacy.util import load_model_from_config, registry
+from spacy.util import (
+    load_config,
+    load_config_from_str,
+    load_model_from_config,
+    registry,
+)
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 39fbbf58217..d5f2f13af4f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,9 +8,14 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import DependencyParser, EntityRecognizer
-from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
-from spacy.pipeline import TrainablePipe
+from spacy.pipeline import (
+    DependencyParser,
+    EntityRecognizer,
+    SentenceRecognizer,
+    Tagger,
+    TextCategorizer,
+    TrainablePipe,
+)
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 7b729d78f21..a47f03e8ab4 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,19 +1,31 @@
 import math
 import os
+import time
 from collections import Counter
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 
+import numpy
 import pytest
 import srsly
 from click import NoSuchOption
 from packaging.specifiers import SpecifierSet
-from thinc.api import Config
+from thinc.api import Config, ConfigValidationError
 
 import spacy
 from spacy import about
-from spacy.cli import download_module, info
-from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
+from spacy.cli import info
+from spacy.cli._util import (
+    download_file,
+    is_subpath_of,
+    load_project_config,
+    parse_config_overrides,
+    string_to_list,
+    substitute_project_variables,
+    upload_file,
+    validate_project_commands,
+    walk_directory,
+)
 from spacy.cli.apply import apply
 from spacy.cli.debug_data import (
     _compile_gold,
@@ -31,6 +43,8 @@
 from spacy.cli.init_config import RECOMMENDATIONS, fill_config, init_config
 from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import _is_permitted_package_name, get_third_party_dependencies
+from spacy.cli.project.remote_storage import RemoteStorage
+from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 1789d60ea4c..32ca639b37d 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -7,7 +7,7 @@
 from typer.testing import CliRunner
 
 from spacy.cli._util import app, get_git_version
-from spacy.tokens import Doc, DocBin, Span
+from spacy.tokens import Doc, DocBin
 
 from .util import make_tempdir, normalize_whitespace
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index e4b06893c93..25352d2bb16 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -4,7 +4,7 @@
 from unittest import mock
 
 import pytest
-from thinc.api import CupyOps, NumpyOps, get_current_ops
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 import spacy
 from spacy.lang.de import German
@@ -13,12 +13,14 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.lang.en import English
-from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error, find_matching_language
-from spacy.util import load_model_from_config
-import spacy
-from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
+from spacy.util import (
+    find_matching_language,
+    ignore_error,
+    load_model_from_config,
+    raise_error,
+    registry,
+)
+from spacy.vocab import Vocab
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 160e2ecf335..c05ef625e11 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,19 @@
 import ctypes
 import os
 from pathlib import Path
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
-from spacy.util import find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+
+import pytest
+from pydantic import ValidationError
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
index fb034accac2..2c2fcef755e 100644
--- a/spacy/tests/test_symbols.py
+++ b/spacy/tests/test_symbols.py
@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.symbols import IDS, NAMES
 
 V3_SYMBOLS = {
diff --git a/spacy/tests/training/test_loop.py b/spacy/tests/training/test_loop.py
index 46d01509504..9140421b46b 100644
--- a/spacy/tests/training/test_loop.py
+++ b/spacy/tests/training/test_loop.py
@@ -1,11 +1,13 @@
 from typing import Callable, Iterable, Iterator
+
 import pytest
+from thinc.api import Config
+
 from spacy import Language
 from spacy.training import Example
 from spacy.training.initialize import init_nlp_student
 from spacy.training.loop import distill, train
 from spacy.util import load_model_from_config, registry
-from thinc.api import Config
 
 
 @pytest.fixture
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index ef20ec365c6..e8a19947606 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,10 +8,17 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
-from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
-from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
-from spacy.training.alignment_array import AlignmentArray
+from spacy.training import (
+    Alignment,
+    Corpus,
+    Example,
+    biluo_tags_to_offsets,
+    biluo_tags_to_spans,
+    docs_to_json,
+    iob_to_biluo,
+    offsets_to_biluo_tags,
+    validate_distillation_examples,
+)
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 58d30c3202f..b2e50969462 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,12 +2,7 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
-from .typedefs cimport hash_t
-from .structs cimport LexemeC, SpanC, TokenC
-from .tokens.doc cimport Doc
-from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
-from .strings cimport StringStore
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc
 from .typedefs cimport hash_t
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 12a78d39fc4..94397b22d9d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,4 +1,4 @@
-# cython: embedsignature=True, binding=True
+# cython: embedsignature=True, profile=True, binding=True
 cimport cython
 from cymem.cymem cimport Pool
 from cython.operator cimport dereference as deref
@@ -9,17 +9,11 @@ from preshed.maps cimport PreshMap
 
 import re
 
-from .tokens.doc cimport Doc
-from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
-from .attrs import intify_attrs
-from .symbols import ORTH, NORM
-from .errors import Errors
 from . import util
-from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index e5a244360e3..7617e462fde 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -1,9 +1,9 @@
 from ._serialize import DocBin
 from .doc import Doc
+from .doc_bin import DocBin
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .doc_bin import DocBin
-from .morphanalysis import MorphAnalysis
+from .token import Token
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 2b39d5baa28..dc7c0143029 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,7 +8,6 @@ from typing import (
     List,
     Optional,
     Protocol,
-    Sequence,
     Tuple,
     Union,
     overload,
@@ -17,20 +16,15 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-from .span import Span
-from .token import Token
-from .span_groups import SpanGroups
-from .retokenizer import Retokenizer
+
 from ..lexeme import Lexeme
 from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from ._retokenize import Retokenizer
+from .retokenizer import Retokenizer
 from .span import Span
+from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore
 
-DOCBIN_ALL_ATTRS: Tuple[str, ...]
-
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 893ba9c2cda..a2501003bb8 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,15 +20,8 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-from .span_groups import SpanGroups
-from .token cimport Token
-from ..lexeme cimport Lexeme, EMPTY_LEXEME
-from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport attr_id_t
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
-from ._dict_proxies import SpanGroups
+from .span_groups import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -42,7 +35,6 @@ from ..attrs cimport (
     LENGTH,
     MORPH,
     NORM,
-    ORTH,
     POS,
     SENT_START,
     SPACY,
@@ -50,22 +42,17 @@ from ..attrs cimport (
     attr_id_t,
 )
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
-from ..typedefs cimport attr_t
+from ..typedefs cimport attr_t, flags_t
 from .token cimport Token
 
 from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
-from ..compat import copy_reg
+from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
-from .. import util
-from .. import parts_of_speech
-from .. import schemas
-from .underscore import Underscore, get_ext_args
-from .retokenizer import Retokenizer
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
-from ._retokenize import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
+from .retokenizer import Retokenizer
 from .underscore import Underscore, get_ext_args
 
 DEF PADDING = 5
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 8a08864d46e..4dda40a05ee 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,7 +10,9 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import ensure_path, SimpleFrozenList
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab import Vocab
+from .doc import Doc
 from .span_groups import SpanGroups
 
 # fmt: off
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 22ce18181a7..7ded04500a3 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -1,10 +1,9 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
-# cython: profile=False
 from typing import Generator, List, Tuple
 
 cimport cython
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t
+from libc.stdint cimport int32_t, int64_t
 from libcpp.pair cimport pair
 from libcpp.unordered_map cimport unordered_map
 from libcpp.unordered_set cimport unordered_set
@@ -12,12 +11,13 @@ from libcpp.unordered_set cimport unordered_set
 import weakref
 
 from murmurhash.mrmr cimport hash64
+from preshed.maps cimport map_get_unless_missing
 
 from .. import Errors
 
-from ..typedefs cimport hash_t
 from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
+from ..typedefs cimport hash_t
 
 from .token import Token
 
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index f866488ecc2..73922c62b9b 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,9 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
-from ..morphology cimport MorphAnalysisC
 from libcpp.memory cimport shared_ptr
 
+from ..morphology cimport MorphAnalysisC
+from ..typedefs cimport hash_t
+from ..vocab cimport Vocab
+
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index ceaa3ecd04e..014c01a2f74 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,17 +1,15 @@
-# cython: profile=False
 cimport numpy as np
 from libc.string cimport memset
 
 from ..errors import Errors
 from ..morphology import Morphology
 
-from ..morphology cimport check_feature, get_by_field, list_features
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr
+
+from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
-from libcpp.memory cimport shared_ptr
-from cython.operator cimport dereference as deref
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index c0052ca9a9a..7b6501d4442 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -1,6 +1,7 @@
-# cython: infer_types=True, bounds_check=False
+# cython: infer_types=True, bounds_check=False, profile=True
 from cymem.cymem cimport Pool
-from libc.string cimport memset
+from libc.stdlib cimport free, malloc
+from libc.string cimport memcpy, memset
 
 import numpy
 from thinc.api import get_array_module
@@ -9,12 +10,15 @@ from ..attrs cimport MORPH, NORM
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
 from ..structs cimport LexemeC, TokenC
 from ..vocab cimport Vocab
-from .doc cimport Doc, set_children_from_heads, token_by_start
+from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
 from .span cimport Span
 from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
+from ..util import SimpleFrozenDict
+from .underscore import is_writable_attr
+
 from ..strings cimport get_string_id
 
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index ce318ed0dfb..fb592e68bd8 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,5 +1,5 @@
-from libcpp.memory cimport shared_ptr
 cimport numpy as np
+from libcpp.memory cimport shared_ptr
 
 from ..structs cimport SpanC
 from ..typedefs cimport attr_t
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 6b7782b788b..c574d86372c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -3,17 +3,20 @@ cimport numpy as np
 from libc.math cimport sqrt
 from libcpp.memory cimport make_shared
 
+import copy
+import warnings
+
 import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport *
-from ..attrs cimport ORTH, attr_id_t
+from ..attrs cimport attr_id_t
 from ..lexeme cimport Lexeme
-from ..structs cimport TokenC
+from ..parts_of_speech cimport univ_pos_t
+from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr
-from .token cimport Token
+from ..typedefs cimport attr_t, flags_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
 
 from ..errors import Errors, Warnings
 from ..util import normalize_slice
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 8a524926a03..bc5bb92d38c 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -1,17 +1,16 @@
-# cython: profile=False
 import struct
 import weakref
 from copy import deepcopy
-from typing import Iterable, Optional, Union
+from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
 
 import srsly
 
 from spacy.errors import Errors
 
-from .span cimport Span
-from libc.stdint cimport uint64_t, uint32_t, int32_t
 from libcpp.memory cimport make_shared
 
+from .span cimport Span
+
 
 cdef class SpanGroup:
     """A group of spans that all belong to the same Doc object. The group
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 6c4806ff9cb..7e9c1ef4b50 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -2,11 +2,13 @@
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
+from cython.view cimport array as cvarray
 
 np.import_array()
 
 import warnings
 
+import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport (
@@ -27,7 +29,6 @@ from ..attrs cimport (
     LIKE_EMAIL,
     LIKE_NUM,
     LIKE_URL,
-    ORTH,
 )
 from ..lexeme cimport Lexeme
 from ..symbols cimport conj
@@ -39,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+
 from cython.operator cimport dereference as deref
 
 
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 358b2bd806d..adfc2bb6658 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,11 +1,9 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
-from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .callbacks import create_copy_from_base_model  # noqa: F401
 from .corpus import Corpus, JsonlCorpus, PlainTextCorpus  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .iob_utils import (  # noqa: F401
@@ -19,28 +17,3 @@
     tags_to_entities,
 )
 from .loggers import console_logger  # noqa: F401
-
-__all__ = [
-    "Alignment",
-    "Corpus",
-    "Example",
-    "JsonlCorpus",
-    "PlainTextCorpus",
-    "biluo_tags_to_offsets",
-    "biluo_tags_to_spans",
-    "biluo_to_iob",
-    "create_copy_from_base_model",
-    "docs_to_json",
-    "dont_augment",
-    "iob_to_biluo",
-    "minibatch_by_padded_size",
-    "minibatch_by_words",
-    "offsets_to_biluo_tags",
-    "orth_variants_augmenter",
-    "read_json_file",
-    "remove_bilu_prefix",
-    "split_bilu_label",
-    "tags_to_entities",
-    "validate_get_examples",
-    "validate_examples",
-]
diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx
index c68110e304f..79fec73c411 100644
--- a/spacy/training/align.pyx
+++ b/spacy/training/align.pyx
@@ -1,4 +1,3 @@
-# cython: profile=False
 import re
 from itertools import chain
 from typing import List, Tuple
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 469bb263016..21f1b29f5a2 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,4 +1,17 @@
 import itertools
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    TypeVar,
+    Union,
+)
+
 from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 21c3d56a118..c2f3b8b51fa 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
 
 from ..errors import Errors
+from ..language import Language
 from ..util import load_model, logger, registry
 
-if TYPE_CHECKING:
-    from ..language import Language
-
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index 1ff7a64e09d..a78c39aea7b 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,9 +1,13 @@
 import srsly
-from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations_to_doc
-from ..example import _fix_legacy_dict_data, _parse_example_dict_data
-from ...util import load_model
+
 from ...lang.mul import MultiLanguage
+from ...util import load_model
+from ..example import (
+    _fix_legacy_dict_data,
+    _parse_example_dict_data,
+    annotations_to_doc,
+)
+from ..gold_io import json_iterate, json_to_annotations
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index b2c93f24bfa..914e877f579 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,5 +1,6 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index afbdf463110..a42e8f6425b 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -1,4 +1,4 @@
-# cython: profile=False
+import json
 import warnings
 
 import srsly
@@ -6,7 +6,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 
 
 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 61ad1c09cc0..781614c34d0 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,9 +1,3 @@
-from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
-from thinc.api import Config, ConfigValidationError
-from pathlib import Path
-import srsly
-import numpy
-import tarfile
 import gzip
 import tarfile
 import warnings
@@ -15,14 +9,27 @@
 import numpy
 import srsly
 import tqdm
-from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
+from thinc.api import Config, ConfigValidationError
 
 from ..errors import Errors, Warnings
+from ..lookups import Lookups
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
-from ..util import registry, load_model_from_config, resolve_dot_names, logger
-from ..util import load_model, ensure_path, get_sourced_components
-from ..util import OOV_RANK, DEFAULT_OOV_PROB
-from ..util import set_gpu_allocator_from_config, set_seed_from_config
+from ..util import (
+    DEFAULT_OOV_PROB,
+    OOV_RANK,
+    ensure_path,
+    get_sourced_components,
+    load_model,
+    load_model_from_config,
+    logger,
+    registry,
+    resolve_dot_names,
+    set_gpu_allocator_from_config,
+    set_seed_from_config,
+)
+from ..vectors import Mode as VectorsMode
+from ..vectors import Vectors
+from .pretrain import get_tok2vec_ref
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index ad162678fec..63715ec2c42 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,20 +3,34 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from thinc.api import Optimizer, Config, constant
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from thinc.api import Config, Optimizer, constant
 from wasabi import Printer
-import random
-import sys
-import shutil
 
-
-from .example import Example
-from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from .. import ty
 from ..errors import Errors
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
 from ..tokens.doc import Doc
-from .. import ty
-from ..util import resolve_dot_names, registry, logger
-from ..util import set_gpu_allocator_from_config, set_seed_from_config
+from ..util import (
+    logger,
+    registry,
+    resolve_dot_names,
+    set_gpu_allocator_from_config,
+    set_seed_from_config,
+)
+from .example import Example
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
diff --git a/spacy/ty.py b/spacy/ty.py
index ac09cb336ac..e4f34a5f651 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,5 +1,17 @@
-from typing import TYPE_CHECKING, Protocol, runtime_checkable
-from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    runtime_checkable,
+)
+
+from thinc.api import Model, Optimizer
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index 3bb92e7334c..ae9837e3afe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,13 +2,7 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model, constant as constant_schedule
-from thinc.api import fix_random_seed, set_gpu_allocator
-import functools
+import inspect
 import itertools
 import logging
 import os
@@ -55,15 +49,9 @@
 from packaging.requirements import Requirement
 from packaging.specifiers import InvalidSpecifier, SpecifierSet
 from packaging.version import InvalidVersion, Version
-from thinc.api import (
-    Adam,
-    Config,
-    ConfigValidationError,
-    Model,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-)
+from thinc.api import Adam, Config, ConfigValidationError, Model, NumpyOps, Optimizer
+from thinc.api import constant as constant_schedule
+from thinc.api import fix_random_seed, get_current_ops, set_gpu_allocator
 
 try:
     import cupy.random
@@ -71,12 +59,9 @@
     cupy = None
 
 
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
-from .errors import Errors, Warnings
 from . import about
-from .compat import CudaStream, cupy, importlib_metadata, is_windows
-from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
+from .compat import CudaStream, cupy, is_windows
+from .errors import Errors, Warnings
 from .symbols import ORTH
 
 if TYPE_CHECKING:
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index e16efd2738d..876c56bed1d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,15 +1,13 @@
-# cython: infer_types=True, binding=True
-from typing import Callable
-
+cimport numpy as np
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64
 
+import functools
 import warnings
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Union, cast
+from typing import cast
 
 import numpy
 import srsly
@@ -21,13 +19,9 @@ from .attrs cimport ORTH, attr_id_t
 from .strings cimport StringStore
 
 from . import util
-from .attrs import IDS
 from .errors import Errors, Warnings
 from .strings import get_string_id
 
-if TYPE_CHECKING:
-    from .vocab import Vocab  # noqa: F401  # no-cython-lint
-
 
 def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3ccfa6db622..3ff7e3d69c4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,5 +1,7 @@
 import functools
 
+import functools
+
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
@@ -16,6 +18,7 @@ from .errors import Errors
 from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
 from .lang.norm_exceptions import BASE_NORMS
 from .lookups import Lookups
+from .util import registry
 from .vectors import Mode as VectorsMode
 from .vectors import Vectors
 

From 12de17a5d248cf96787b3e0df2f7fd057949d0c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:21 +0200
Subject: [PATCH 187/504] Fix span <-> underscore import cycle

---
 spacy/tokens/underscore.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 63706851286..c3e3641d454 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
-from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
+    from .span import Span
     from .token import Token
 
 
@@ -40,7 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
-        if type(obj) == Span:
+        # We used to check if obj is a span, however, this introduces an
+        # import cycle between the span and underscore modeles. So we
+        # do a structural type check instead.
+        if hasattr(obj, "id") and hasattr(obj, "label") and hasattr(obj, "kb_id"):
             object.__setattr__(self, "_label", label)
             object.__setattr__(self, "_kb_id", kb_id)
             object.__setattr__(self, "_span_id", span_id)

From 3ebcac097d800151af0ae2b59218da6af1c3dacb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:45 +0200
Subject: [PATCH 188/504] Fix training.callbacks <-> language import cycle

---
 spacy/training/callbacks.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index c2f3b8b51fa..21c3d56a118 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,9 +1,11 @@
-from typing import Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional
 
 from ..errors import Errors
-from ..language import Language
 from ..util import load_model, logger, registry
 
+if TYPE_CHECKING:
+    from ..language import Language
+
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(

From ed93040ac43e0dfe99ec06cc7f98421bc6f44eba Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 6 Jul 2023 15:20:13 +0200
Subject: [PATCH 189/504] Disallow False for first/last arguments of add_pipe
 (#12793)

* Literal True for first/last options

* add test case

* update docs

* remove old redundant test case

* black formatting

* use Optional typing in docstrings

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/errors.py                           |  1 +
 spacy/language.py                         | 20 ++++++++++++--------
 spacy/tests/pipeline/test_pipe_methods.py | 18 ++++++++++++++++--
 website/docs/api/language.mdx             |  7 ++++---
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4909371d549..2ddaef19bca 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -989,6 +989,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index ea641224684..5b2652db53b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -763,8 +763,8 @@ def add_pipe(
         *,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
         source: Optional["Language"] = None,
         config: Dict[str, Any] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
@@ -783,8 +783,8 @@ def add_pipe(
             component directly before.
         after (Union[str, int]): Name or index of the component to insert new
             component directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         source (Language): Optional loaded nlp object to copy the pipeline
             component from.
         config (Dict[str, Any]): Config parameters to use for this component.
@@ -830,18 +830,22 @@ def _get_pipe_index(
         self,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
     ) -> int:
         """Determine where to insert a pipeline component based on the before/
         after/first/last values.
 
         before (str): Name or index of the component to insert directly before.
         after (str): Name or index of component to insert directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         RETURNS (int): The index of the new pipeline component.
         """
+        if first is not None and first is not True:
+            raise ValueError(Errors.E4009.format(attr="first", value=first))
+        if last is not None and last is not True:
+            raise ValueError(Errors.E4009.format(attr="last", value=last))
         all_args = {"before": before, "after": after, "first": first, "last": last}
         if sum(arg is not None for arg in [before, after, first, last]) >= 2:
             raise ValueError(
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 39611a74278..063e5bf67fd 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -189,6 +189,22 @@ def test_add_pipe_last(nlp, name1, name2):
     assert nlp.pipeline[-1][0] == name1
 
 
+@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")])
+def test_add_pipe_false(nlp, name1, name2):
+    Language.component("new_pipe2", func=lambda doc: doc)
+    nlp.add_pipe("new_pipe2", name=name2)
+    with pytest.raises(
+        ValueError,
+        match="The 'last' parameter should be 'None' or 'True', but found 'False'",
+    ):
+        nlp.add_pipe("new_pipe", name=name1, last=False)
+    with pytest.raises(
+        ValueError,
+        match="The 'first' parameter should be 'None' or 'True', but found 'False'",
+    ):
+        nlp.add_pipe("new_pipe", name=name1, first=False)
+
+
 def test_cant_add_pipe_first_and_last(nlp):
     with pytest.raises(ValueError):
         nlp.add_pipe("new_pipe", first=True, last=True)
@@ -411,8 +427,6 @@ def test_add_pipe_before_after():
         nlp.add_pipe("entity_ruler", before="ner", after=2)
     with pytest.raises(ValueError):
         nlp.add_pipe("entity_ruler", before=True)
-    with pytest.raises(ValueError):
-        nlp.add_pipe("entity_ruler", first=False)
 
 
 def test_disable_enable_pipes():
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 82cb1c14cef..d65ea376431 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -436,7 +436,8 @@ component factory registered using
 [`@Language.component`](/api/language#component) or
 [`@Language.factory`](/api/language#factory). Components should be callables
 that take a `Doc` object, modify it and return it. Only one of `before`,
-`after`, `first` or `last` can be set. Default behavior is `last=True`.
+`after`, `first` or `last` can be set. The arguments `first` and `last` can
+either be `None` or `True`. Default behavior is `last=True`.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -471,8 +472,8 @@ component, adds it to the pipeline and returns it.
 | _keyword-only_                        |                                                                                                                                                                                                                                                                                          |
 | `before`                              | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~                                                                                                                                                                                               |
 | `after`                               | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~                                                                                                                                                                                                |
-| `first`                               | Insert component first / not first in the pipeline. ~~Optional[bool]~~                                                                                                                                                                                                                   |
-| `last`                                | Insert component last / not last in the pipeline. ~~Optional[bool]~~                                                                                                                                                                                                                     |
+| `first`                               | Insert component first in the pipeline if set to `True`. ~~Optional[Literal[True]]~~                                                                                                                                                                                                     |
+| `last`                                | Insert component last in the pipeline if set to `True`. ~~Optional[Literal[True]]~~                                                                                                                                                                                                      |
 | `config` <Tag variant="new">3</Tag>   | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~                                                                                                                                    |
 | `source` <Tag variant="new">3</Tag>   | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
 | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~                                                                                                                                                           |

From 3edd64fd6e2c3d270d46c6f3076a1b459db9594b Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 16:38:29 +0200
Subject: [PATCH 190/504] merge fixes

---
 .../_parser_internals/_beam_utils.pyx         |  4 +-
 spacy/pipeline/morphologizer.pyx              |  1 -
 spacy/pipeline/transition_parser.pyx          | 27 ++++------
 spacy/tests/pipeline/test_tok2vec.py          | 54 +++++++++++++++++++
 .../tests/serialize/test_serialize_config.py  |  1 +
 spacy/tokens/span.pyx                         |  3 +-
 spacy/tokens/token.pyx                        |  2 +-
 spacy/vectors.pyx                             |  2 +-
 8 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 7098b822ef0..7c546752d80 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -4,9 +4,7 @@ cimport numpy as np
 
 import numpy
 
-from cpython.ref cimport Py_XDECREF, PyObject
-
-from ...typedefs cimport class_t, hash_t
+from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 7259fc02699..765fd83f111 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -19,7 +19,6 @@ from ..scorer import Scorer
 from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .pipe import deserialize_config
 from .tagger import ActivationsT, Tagger
 
 # See #9050
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d521aeced7f..8e4bee2b3dd 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -7,15 +7,9 @@ from typing import Dict, Iterable, List, Optional, Tuple
 cimport numpy as np
 from cymem.cymem cimport Pool
 
-from itertools import islice
-
-from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
 import contextlib
 import random
-import warnings
+from itertools import islice
 
 import numpy
 import numpy.random
@@ -24,29 +18,21 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
-    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
-    softmax_activation,
-    use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
 
 from ..tokens.doc cimport Doc
-from ._parser_internals cimport _beam_utils
-from ._parser_internals.search cimport Beam
-from ._parser_internals.stateclass cimport StateC, StateClass
-from .trainable_pipe cimport TrainablePipe
-
-from ._parser_internals import _beam_utils
-
 from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
+from ._parser_internals cimport _beam_utils
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 from .. import util
 from ..errors import Errors, Warnings
@@ -62,6 +48,11 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
+
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index e557e294112..f6cefbc1f84 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -642,3 +642,57 @@ def tok2vec_distill_wrapper(
 
     student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
     student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
+
+
+def test_tok2vec_listener_source_link_name():
+    """The component's internal name and the tok2vec listener map correspond
+    to the most recently modified pipeline.
+    """
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+
+    # there is no way to have the component have the right name for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+
+    # there is no way to have the tok2vec have the right listener map for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.add_pipe("ner", name="ner3", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
+    nlp2.remove_pipe("ner3")
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.remove_pipe("tagger2")
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+
+    # at this point the tok2vec component corresponds to nlp2
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
+    nlp1.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    # modifying nlp2 syncs it back to nlp2
+    nlp2.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+
+def test_tok2vec_listener_source_replace_listeners():
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
+    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("tagger", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("ner", name="ner2", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 646ce0f5d48..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -18,6 +18,7 @@
     build_Tok2Vec_model,
 )
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.training import Example
 from spacy.util import (
     load_config,
     load_config_from_str,
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c574d86372c..da93550569e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -15,8 +15,9 @@ from ..lexeme cimport Lexeme
 from ..parts_of_speech cimport univ_pos_t
 from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
+from ..typedefs cimport attr_t
 from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from .token cimport Token
 
 from ..errors import Errors, Warnings
 from ..util import normalize_slice
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 7e9c1ef4b50..26e571ee802 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -431,7 +431,7 @@ cdef class Token:
         if "vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["vector"](self)
         else:
-            return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
+            return self.vocab.get_vector(self.c.lex.orth)
 
     @property
     def vector_norm(self):
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 876c56bed1d..111a9d01e08 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -130,7 +130,7 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
         """Create a new vector store.
 
         strings (StringStore): The string store.

From 73c3677e8ca9713924ddae9e55cd8414c7afbffb Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 17:41:29 +0200
Subject: [PATCH 191/504] cython fixes and cleanup

---
 spacy/matcher/phrasematcher.pyx               |  2 -
 spacy/ml/tb_framework.pyx                     | 55 ++++++++++---------
 spacy/morphology.pyx                          |  6 +-
 spacy/parts_of_speech.pxd                     |  2 +-
 spacy/pipeline/_parser_internals/ner.pyx      |  1 -
 spacy/pipeline/_parser_internals/search.pxd   |  1 -
 spacy/pipeline/_parser_internals/search.pyx   | 12 ++--
 .../_parser_internals/transition_system.pxd   |  4 +-
 .../_parser_internals/transition_system.pyx   | 21 ++++---
 spacy/pipeline/morphologizer.pyx              |  3 +-
 spacy/pipeline/pipe.pyx                       |  5 +-
 spacy/pipeline/trainable_pipe.pyx             | 17 +++---
 spacy/pipeline/transition_parser.pyx          | 55 ++++++++++---------
 spacy/strings.pyx                             |  9 +--
 spacy/tests/parser/_search.pyx                | 49 +++++++++--------
 spacy/tokens/doc.pyx                          |  2 +-
 spacy/tokens/morphanalysis.pyx                |  1 -
 spacy/tokens/span.pyx                         |  3 +-
 18 files changed, 119 insertions(+), 129 deletions(-)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 107d7d926ee..d1a8eaf33c4 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -160,7 +160,6 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
@@ -196,7 +195,6 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index fd0af12ceab..ed04045a6a5 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (
 
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
 
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
     Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
     if X is not None:
-        docs, moves = X
+        docs, _ = X
         model.get_ref("tok2vec").initialize(X=docs)
     else:
         model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")
@@ -192,9 +192,10 @@ class TransitionModelInputs:
         self,
         docs: List[Doc],
         moves: TransitionSystem,
-        actions: Optional[List[Ints1d]]=None,
-        max_moves: int=0,
-        states: Optional[List[State]]=None):
+        actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0,
+        states: Optional[List[State]] = None,
+    ):
         """
         actions (Optional[List[Ints1d]]): actions to apply for each Doc.
         docs (List[Doc]): Docs to predict transition sequences for.
@@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
     else:
         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-            feats, backprop_feats, seen_mask, is_train, actions=actions,
-            max_moves=inputs.max_moves)
+                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
+                                 max_moves=inputs.max_moves)
 
 
 def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
     cdef vector[StateC*] c_states
     cdef StateClass state
     for state in states:
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
 
     return (states, scores), backprop
 
+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
     cdef vector[StateC *] unfinished
     cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
@@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                    sizes.states)
+                                   sizes.states)
             else:
                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
             for i in range(sizes.states):
@@ -302,8 +304,8 @@ def _forward_fallback(
     backprop_feats,
     seen_mask,
     is_train: bool,
-    actions: Optional[List[Ints1d]]=None,
-    max_moves: int=0):
+    actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")
@@ -371,7 +373,7 @@ def _forward_fallback(
             for clas in set(model.attrs["unseen_classes"]):
                 if (d_scores[:, clas] < 0).any():
                     model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
         # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
         output.inc_grad("b", d_scores.sum(axis=0))
@@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
@@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
     else:
         # Compute hidden-to-output
         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                      1.0, <const float *>A.hiddens, n.hiddens,
-                      <const float *>W.hidden_weights, n.hiddens,
-                      0.0, scores, n.classes)
+                     1.0, <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens,
+                     0.0, scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                 scores[i*n.classes+j] = min_
 
 
-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
     cdef int F = n.feats
     cdef int T = n.tokens
     padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                 feature = &cached[idx]
             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
         token_ids += F
-
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index d75c1071941..e7f93b78b47 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -80,15 +80,13 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
-
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-        ])
+            ])
         return norm_feats_string or self.EMPTY_MORPH
 
-
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 01f116ea688..22a571be7b0 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
     ADV = symbols.ADV
     AUX = symbols.AUX
     CONJ = symbols.CONJ
-    CCONJ  = symbols.CCONJ  # U20
+    CCONJ = symbols.CCONJ  # U20
     DET = symbols.DET
     INTJ = symbols.INTJ
     NOUN = symbols.NOUN
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 5c31ff5c21d..3a352f51ff5 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -2,7 +2,6 @@ import os
 import random
 
 from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index 4626496335a..ad68dc5c718 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -57,7 +57,6 @@ cdef class Beam:
     cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
                      void* extra_args) except -1
     cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
- 
 
     cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
         self.scores[i][j] = score
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 251eaa805cb..578299b56ae 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,11 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.math cimport exp, log
-from libc.string cimport memcpy, memset
-
-import math
-
 from cymem.cymem cimport Pool
+from libc.math cimport exp
+from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap
 
 
@@ -70,7 +67,7 @@ cdef class Beam:
             self.costs[i][j] = costs[j]
 
     cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i, j
+        cdef int i
         for i in range(self.width):
             memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
             memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@@ -176,7 +173,6 @@ cdef class Beam:
         beam-width, and n is the number of classes.
         """
         cdef Entry entry
-        cdef weight_t score
         cdef _State* s
         cdef int i, j, move_id
         assert self.size >= 1
@@ -269,7 +265,7 @@ cdef class MaxViolation:
                 # This can happen from non-monotonic actions
                 # If we find a better gold analysis this way, be sure to keep it.
                 elif pred._states[i].loss <= 0 \
-                and tuple(pred.histories[i]) not in seen_golds:
+                        and tuple(pred.histories[i]) not in seen_golds:
                     g_scores.append(pred._states[i].score)
                     g_hist.append(list(pred.histories[i]))
             for i in range(gold.size):
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 66cc7747b69..08baed932ba 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -60,7 +60,7 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil
+                          int batch_size) nogil
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-        int nr_class, int batch_size) nogil
+                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index a433ce7dc75..50b155bf9bb 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -294,19 +294,19 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil:
-        cdef int i
-        cdef Transition action
-        cdef StateC* state
-        for i in range(batch_size):
-            state = states[i]
-            action = moves.c[actions[i]]
-            action.do(state, action.label)
-            state.history.push_back(action.clas)
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
 
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-    int nr_class, int batch_size) nogil:
+                             int nr_class, int batch_size) nogil:
     is_valid = <int*>calloc(moves.n_moves, sizeof(int))
     cdef int i, guess
     cdef Transition action
@@ -322,4 +322,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
             action.do(states[i], action.label)
             states[i].history.push_back(guess)
     free(is_valid)
-
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 765fd83f111..669a5424412 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,8 +1,7 @@
 # cython: infer_types=True, profile=True, binding=True
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, Optional, Union
 
-import srsly
 from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 7bc6735a802..8409e64c3cb 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,12 +1,11 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
-from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
 
 import srsly
 
 from ..tokens.doc cimport Doc
 
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example
 from ..util import raise_error
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index e7cf566a113..065a6c20d62 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,5 +1,4 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
@@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
 from ..tokens.doc cimport Doc
 
 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
@@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-
     def distill(self,
-               teacher_pipe: Optional["TrainablePipe"],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+                teacher_pipe: Optional["TrainablePipe"],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ) -> Dict[str, float]:
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is typically trained on the probability
         distribution of the teacher, but details may differ per pipe.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 8e4bee2b3dd..9fa0d4987b8 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -228,12 +228,13 @@ class Parser(TrainablePipe):
         raise NotImplementedError
 
     def distill(self,
-               teacher_pipe: Optional[TrainablePipe],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None):
+                teacher_pipe: Optional[TrainablePipe],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ):
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is trained on the transition probabilities
         of the teacher.
@@ -283,11 +284,13 @@ class Parser(TrainablePipe):
         # teacher's distributions.
 
         student_inputs = TransitionModelInputs(docs=student_docs,
-            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
+                                               states=[state.copy() for state in states],
+                                               moves=self.moves,
+                                               max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
         actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            states=states, moves=teacher_pipe.moves, actions=actions)
+                                               states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -300,10 +303,9 @@ class Parser(TrainablePipe):
 
         return losses
 
-
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-        normalize: bool=False,
+            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+            normalize: bool = False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -326,9 +328,9 @@ class Parser(TrainablePipe):
         # ourselves.
 
         teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
         student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
 
         assert teacher_scores.shape == student_scores.shape
 
@@ -442,13 +444,15 @@ class Parser(TrainablePipe):
         else:
             init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
-            max_moves=max_moves, states=[state.copy() for state in init_states])
+        inputs = TransitionModelInputs(docs=docs,
+                                       moves=self.moves,
+                                       max_moves=max_moves,
+                                       states=[state.copy() for state in init_states])
         (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
         if sum(s.shape[0] for s in scores) == 0:
             return losses
         d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-            examples, max_moves)
+                                 examples, max_moves)
         backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
@@ -489,9 +493,7 @@ class Parser(TrainablePipe):
         cdef TransitionSystem moves = self.moves
         cdef StateClass state
         cdef int clas
-        cdef int nF = self.model.get_dim("nF")
         cdef int nO = moves.n_moves
-        cdef int nS = sum([len(history) for history in histories])
         cdef Pool mem = Pool()
         cdef np.ndarray costs_i
         is_valid = <int*>mem.alloc(nO, sizeof(int))
@@ -558,8 +560,8 @@ class Parser(TrainablePipe):
 
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0.,
+                    sgd=None, losses=None, beam_density=0.0):
         raise NotImplementedError
 
     def set_output(self, nO):
@@ -684,9 +686,10 @@ class Parser(TrainablePipe):
             return states
 
         # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
-            states=[state.copy() for state in to_cut])
-        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+        teacher_inputs = TransitionModelInputs(docs=docs,
+                                               moves=moves,
+                                               states=[state.copy() for state in to_cut])
+        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
 
         # Step through the teacher's actions and store every state after
         # each multiple of max_length.
@@ -784,6 +787,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
 
     return actions
 
+
 def _states_diff_to_actions(
     before_states: List[StateClass],
     after_states: List[StateClass]
@@ -804,8 +808,9 @@ def _states_diff_to_actions(
         c_state_before = before_state.c
         c_state_after = after_state.c
 
-        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
-            c_state_after.history.begin())
+        assert equal(c_state_before.history.begin(),
+                     c_state_before.history.end(),
+                     c_state_after.history.begin())
 
     actions = []
     while True:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 43826f07c44..28e06a2ecea 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,6 @@
 # cython: infer_types=True
-from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
 
-cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from libcpp.set cimport set
@@ -244,7 +243,6 @@ cdef class StringStore:
         cdef int n_length_bytes
         cdef int i
         cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
-        cdef uint32_t ulength = length
         if length < sizeof(string.s):
             string.s[0] = <unsigned char>length
             memcpy(&string.s[1], chars, length)
@@ -302,7 +300,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
 
     try:
         return hash_string(string_or_hash)
-    except:
+    except:   # no-cython-lint
         if _try_coerce_to_hash(string_or_hash, &str_hash):
             # Coerce the integral key to the expected primitive hash type.
             # This ensures that custom/overloaded "primitive" data types
@@ -319,6 +317,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
         return True
-    except:
+    except:  # no-cython-lint
         return False
-
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 0983159b75d..cd9e6b2f5ee 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -2,7 +2,7 @@
 from cymem.cymem cimport Pool
 
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t, weight_t
+from spacy.typedefs cimport class_t
 
 import pytest
 
@@ -42,32 +42,35 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
     state = <TestState*>state
     mem.free(state)
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width",
-    [
-        (2, 3),
-        (3, 6),
-        (4, 20),
-    ]
-)
+                         [
+                             (2, 3),
+                             (3, 6),
+                             (4, 20),
+                         ]
+                         )
 def test_init(nr_class, beam_width):
     b = Beam(nr_class, beam_width)
     assert b.size == 1
     assert b.width == beam_width
     assert b.nr_class == nr_class
 
+
 @cytest
 def test_init_violn():
     MaxViolation()
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (2, 3, 3),
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (2, 3, 3),
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_initialize(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
@@ -79,11 +82,11 @@ def test_initialize(nr_class, beam_width, length):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length,extra",
-    [
-        (2, 3, 4, None),
-        (3, 6, 15, u"test beam 1"),
-    ]
-)
+                         [
+                             (2, 3, 4, None),
+                             (3, 6, 15, u"test beam 1"),
+                         ]
+                         )
 def test_initialize_extra(nr_class, beam_width, length, extra):
     b = Beam(nr_class, beam_width)
     if extra is None:
@@ -97,11 +100,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_transition(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a2501003bb8..5a70af00e2e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1755,7 +1755,7 @@ cdef class Doc:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index 014c01a2f74..f3841baa24a 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.string cimport memset
 
 from ..errors import Errors
 from ..morphology import Morphology
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index da93550569e..8e490ec83d0 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -242,8 +242,8 @@ cdef class Span:
 
     @property
     def _(self):
-        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
+        cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
                           start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
@@ -950,7 +950,6 @@ cdef class Span:
             self.id_ = ent_id_
 
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From 70adab2881747e576b5dbced4361349b0019b2bb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 20 Jul 2023 09:59:19 +0200
Subject: [PATCH 192/504] Update spacy/ml/tb_framework.pyx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/ml/tb_framework.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index ed04045a6a5..a48c6b901c7 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -305,7 +305,8 @@ def _forward_fallback(
     seen_mask,
     is_train: bool,
     actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0):
+    max_moves: int = 0,
+):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")

From 36e5c8f49255bd10dd3f4f84af045cbbaf046aba Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 20 Jul 2023 14:08:29 +0200
Subject: [PATCH 193/504] remove unnecessary line

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/ml/tb_framework.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index a48c6b901c7..6c5c29d8549 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -145,7 +145,6 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")

From bd3802d1e134652301209c6e2bc8de37dd6bbe7d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 15:54:35 +0200
Subject: [PATCH 194/504] Recommend lookups tables from URLs or other loaders
 (#12283)

* Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/about.py                             |  4 ++
 spacy/errors.py                            | 25 ++++++++---
 spacy/language.py                          |  7 ----
 spacy/lookups.py                           | 26 +++++++++++-
 spacy/pipeline/lemmatizer.py               | 21 +++++++++-
 spacy/tests/pipeline/test_lemmatizer.py    | 16 ++++++-
 website/docs/api/lemmatizer.mdx            |  4 +-
 website/docs/api/top-level.mdx             | 49 ++++++++++++++++++++++
 website/docs/usage/index.mdx               |  7 ++--
 website/docs/usage/linguistic-features.mdx |  6 +--
 website/src/widgets/quickstart-install.js  |  4 --
 11 files changed, 141 insertions(+), 28 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index ec1dde7cae6..73f201af5fb 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,3 +3,7 @@
 __version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
diff --git a/spacy/errors.py b/spacy/errors.py
index 2ddaef19bca..adca5880283 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,8 @@
 import warnings
 from typing import Literal
 
+from . import about
+
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
             "table. This may degrade the performance of the model to some "
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed and load the table in your config. The "
-            "languages with lexeme normalization tables are currently: "
-            "{langs}\n\nLoad the table in your config with:\n\n"
+            "If this is surprising, make sure you are loading the table in "
+            "your config. The languages with lexeme normalization tables are "
+            "currently: {langs}\n\nAn example of how to load a table in "
+            "your config :\n\n"
             "[initialize.lookups]\n"
-            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
             "tables = [\"lexeme_norm\"]\n")
     W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
@@ -990,6 +993,18 @@ class Errors(metaclass=ErrorsWithCodes):
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
     E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
+    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
+             "[initialize] or in registered lookups (spacy-lookups-data). An "
+             "example for how to load lemmatizer tables in [initialize]:\n\n"
+             "[initialize.components]\n\n"
+             "[initialize.components.{pipe_name}]\n\n"
+             "[initialize.components.{pipe_name}.lookups]\n"
+             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
+             "tables = {tables}\n"
+             "# or required tables only: tables = {required_tables}\n")
+    E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index 5b2652db53b..72d27c598cc 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -128,13 +128,6 @@ def tokenizer_factory(nlp: "Language") -> Tokenizer:
     return tokenizer_factory
 
 
-@registry.misc("spacy.LookupsDataLoader.v1")
-def load_lookups_data(lang, tables):
-    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
-    lookups = load_lookups(lang=lang, tables=tables)
-    return lookups
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 1a2c44bfa1c..e2e92426f6a 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -2,16 +2,40 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import requests
 import srsly
 from preshed.bloom import BloomFilter
 
 from .errors import Errors
 from .strings import get_string_id
-from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
+from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
 
 UNSET = object()
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+    logger.debug(f"Loading lookups from {url}: {tables}")
+    lookups = Lookups()
+    for table in tables:
+        table_url = url + lang + "_" + table + ".json"
+        r = requests.get(table_url)
+        if r.status_code != 200:
+            raise ValueError(
+                Errors.E4011.format(status_code=r.status_code, url=table_url)
+            )
+        table_data = r.json()
+        lookups.add_table(table, table_data)
+    return lookups
+
+
 def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 09e501595a8..ed9547c745b 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import srsly
 from thinc.api import Model
 
 from .. import util
@@ -155,8 +156,24 @@ def initialize(
         """
         required_tables, optional_tables = self.get_lookups_config(self.mode)
         if lookups is None:
-            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
-            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            logger.debug(
+                "Lemmatizer: no lemmatizer lookups tables provided, "
+                "trying to load tables from registered lookups (usually "
+                "spacy-lookups-data)"
+            )
+            lookups = load_lookups(
+                lang=self.vocab.lang, tables=required_tables, strict=False
+            )
+            missing_tables = set(required_tables) - set(lookups.tables)
+            if len(missing_tables) > 0:
+                raise ValueError(
+                    Errors.E4010.format(
+                        missing_tables=list(missing_tables),
+                        pipe_name=self.name,
+                        required_tables=srsly.json_dumps(required_tables),
+                        tables=srsly.json_dumps(required_tables + optional_tables),
+                    )
+                )
             optional_lookups = load_lookups(
                 lang=self.vocab.lang, tables=optional_tables, strict=False
             )
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index ccc2e0b154a..5385fb5d7dd 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -2,9 +2,11 @@
 
 import pytest
 
+import spacy
 from spacy import registry, util
+from spacy.about import __lookups_url__
 from spacy.lang.en import English
-from spacy.lookups import Lookups
+from spacy.lookups import Lookups, load_lookups_data_from_url
 
 from ..util import make_tempdir
 
@@ -113,3 +115,15 @@ def cope_lookups():
 
     # Make sure that lemmatizer cache can be pickled
     pickle.dumps(lemmatizer2)
+
+
+@pytest.mark.parametrize("lang", ("ca", "en"))
+def test_lemmatizer_load_lookups_from_url(lang):
+    nlp = spacy.blank(lang)
+    lemmatizer = nlp.add_pipe("lemmatizer")
+    req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+    lookups = load_lookups_data_from_url(
+        nlp.lang, req_tables + opt_tables, __lookups_url__
+    )
+    lemmatizer.initialize(lookups=lookups)
+    assert set(lemmatizer.lookups.tables) == set(req_tables + opt_tables)
diff --git a/website/docs/api/lemmatizer.mdx b/website/docs/api/lemmatizer.mdx
index f6657dbf48c..5bd0112e237 100644
--- a/website/docs/api/lemmatizer.mdx
+++ b/website/docs/api/lemmatizer.mdx
@@ -14,7 +14,7 @@ implement their own lemmatizer components via
 [language-specific factories](/usage/processing-pipelines#factories-language).
 The default data used is provided by the
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-extension package.
+repository.
 
 For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
 
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
 >
 > ```python
 > lemmatizer = nlp.add_pipe("lemmatizer")
+> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
 > lemmatizer.initialize(lookups=lookups)
 > ```
 >
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 8555d64ba63..a2d4bbdd387 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -9,6 +9,7 @@ menu:
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
   - ['Callbacks', 'callbacks']
+  - ['Miscellaneous', 'misc']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -1058,6 +1059,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
 | `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
 | **CREATES**                 | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~                                    |
 
+## Miscellaneous {id="misc"}
+
+### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function",version="3"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.lookups]
+> @misc = "spacy.LookupsDataLoader.v1"
+> lang = ${nlp.lang}
+> tables = ["lexeme_prob"]
+> ```
+
+Load the specified tables from the [`lookups` registry](#registry), which are
+provided by a package such as
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
+
+| Name        | Description                                                                                     |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                           |
+| `tables`    | The tables to load. ~~List[str]~~                                                               |
+| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
+
+### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.lemmatizer.lookups]
+> @misc = "spacy.LookupsDataLoaderFromURL.v1"
+> lang = ${nlp.lang}
+> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
+> tables = ["lemma_rules","lemma_exc","lemma_index"]
+> ```
+
+Load the specified tables from the provided URL. The individual tables are
+expected to have filenames in the format `{lang}_{table}.json` under the
+specified URL directory as in the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/spacy_lookups_data/data/)
+repository.
+
+| Name        | Description                                                                                 |
+| ----------- | ------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                       |
+| `url`       | The URL for the directory where the tables can be downloaded. ~~str~~                       |
+| `tables`    | The tables to load. ~~List[str]~~                                                           |
+| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
+
 ## Training data and alignment {id="gold",source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index b8b4917f2b2..6faad1d6a0f 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
-keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
 multiple comma-separated extras). See the `[options.extras_require]` section in
 spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 
 > #### Example
 >
 > ```bash
-> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
+> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
 > ```
 
 | Name             | Description                                                                                                                                                                                                                                                    |
 | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
 | `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
 | `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
 | `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
 To install with extras:
 
 ```bash
-$ pip install --no-build-isolation --editable .[lookups,cuda102]
+$ pip install --no-build-isolation --editable .[ja,cuda102]
 ```
 
 How to install compilers and related build tools:
diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx
index 21cedd1ef2c..26d1ad37962 100644
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@@ -148,11 +148,11 @@ component.
 
 </Infobox>
 
-The data for spaCy's lemmatizers is distributed in the package
+The data for spaCy's lemmatizers is distributed in the repository
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
 provided trained pipelines already include all the required tables, but if you
-are creating new pipelines, you'll probably want to install `spacy-lookups-data`
-to provide the data when the lemmatizer is initialized.
+are creating new pipelines, you can load data from the repository in the
+lemmatizer initialization.
 
 ### Lookup lemmatizer {id="lemmatizer-lookup"}
 
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 43e3a0eeb6c..f4e0a01e8ca 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -50,7 +50,6 @@ const QuickstartInstall = ({ id, title }) => {
     const pipExtras = [
         hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
         train && 'transformers',
-        train && 'lookups',
         apple && 'apple',
         ...modelExtras,
     ]
@@ -214,9 +213,6 @@ const QuickstartInstall = ({ id, title }) => {
             <QS config="train" package="conda" comment prompt={false}>
                 # packages only available via pip
             </QS>
-            <QS config="train" package="conda">
-                pip install spacy-lookups-data
-            </QS>
 
             {languages.map(({ code, models: modelOptions }) => {
                 const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]

From c4ecb845379d6e216ce4f02833dda45792e93b3c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 1 Aug 2023 22:24:02 +0900
Subject: [PATCH 195/504] Accept multiple code files in all CLI commands
 (#12101)

* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/cli/_util.py          |   7 ++
 spacy/cli/assemble.py       |   6 +-
 spacy/cli/debug_config.py   |   6 +-
 spacy/cli/debug_data.py     |   6 +-
 spacy/cli/evaluate.py       |   6 +-
 spacy/cli/package.py        |   2 +-
 spacy/cli/pretrain.py       |   6 +-
 spacy/cli/train.py          |   6 +-
 spacy/tests/test_cli_app.py | 206 ++++++++++++++++++++++++++++++++++++
 website/docs/api/cli.mdx    | 108 +++++++++----------
 10 files changed, 286 insertions(+), 73 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index b005accf91f..eed61119070 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -348,6 +348,13 @@ def show_validation_error(
         msg.fail("Config validation error", e, exits=1)
 
 
+def import_code_paths(code_paths: str) -> None:
+    """Helper to import comma-separated list of code paths."""
+    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
+    for code_path in code_paths:
+        import_code(code_path)
+
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index f74bbacb555..7ad0f52fe1d 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -11,7 +11,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -26,7 +26,7 @@ def assemble_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
@@ -46,7 +46,7 @@ def assemble_cli(
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 0e5382cd956..7818b4087e7 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -13,7 +13,7 @@
     Arg,
     Opt,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -27,7 +27,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on
@@ -44,7 +44,7 @@ def debug_config_cli(
     DOCS: https://spacy.io/api/cli#debug-config
     """
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
     )
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 4c44a8c0e2b..714969be145 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -40,7 +40,7 @@
     _format_number,
     app,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -72,7 +72,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@@ -92,7 +92,7 @@ def debug_data_cli(
             "--help for an overview of the other available debugging commands."
         )
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_data(
         config_path,
         config_overrides=overrides,
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 2276ca6b0d4..c3527028e9d 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -10,7 +10,7 @@
 from ..scorer import Scorer
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
 
 
 @benchmark_cli.command(
@@ -22,7 +22,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -43,7 +43,7 @@ def evaluate_cli(
 
     DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
-    import_code(code_path)
+    import_code_paths(code_path)
     evaluate(
         model,
         data_path,
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 9421199f111..06b503271af 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -23,7 +23,7 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
+    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 446c40510df..73337a7ca98 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -11,7 +11,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -27,7 +27,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -56,7 +56,7 @@ def pretrain_cli(
     DOCS: https://spacy.io/api/cli#pretrain
     """
     config_overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
     setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c72e13b2681..40934f546e2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -13,7 +13,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -28,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
@@ -50,7 +50,7 @@ def train_cli(
     if verbose:
         util.logger.setLevel(logging.DEBUG)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
 
 
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 32ca639b37d..f9c1a9d6579 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,4 +1,5 @@
 import os
+import subprocess
 import sys
 from pathlib import Path
 
@@ -6,6 +7,7 @@
 import srsly
 from typer.testing import CliRunner
 
+import spacy
 from spacy.cli._util import app, get_git_version
 from spacy.tokens import Doc, DocBin
 
@@ -47,6 +49,210 @@ def test_convert_auto_conflict():
         assert len(out_files) == 0
 
 
+NOOP_CONFIG = """
+[paths]
+train = null
+dev = null
+vectors = null
+init_tok2vec = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "mul"
+pipeline = ["noop", "noop2"]
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+[components]
+
+[components.noop]
+factory = "noop"
+
+[components.noop2]
+factory = "noop2"
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[training]
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 100
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+dev_corpus = "corpora.dev"
+
+train_corpus = "corpora.train"
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.fixture
+def data_paths():
+    nlp = spacy.blank("mul")
+    doc = nlp("ok")
+    with make_tempdir() as tdir:
+        db = DocBin()
+        # debug data will *fail* if there aren't enough docs
+        for ii in range(100):
+            db.add(doc)
+        fpath = tdir / "data.spacy"
+        db.to_disk(fpath)
+
+        args = [
+            "--paths.train",
+            str(fpath),
+            "--paths.dev",
+            str(fpath),
+        ]
+        yield args
+
+
+@pytest.fixture
+def code_paths():
+    noop_base = """
+from spacy.language import Language
+
+@Language.component("{}")
+def noop(doc):
+    return doc
+"""
+
+    with make_tempdir() as temp_d:
+        # write code files to load
+        paths = []
+        for ff in ["noop", "noop2"]:
+            pyfile = temp_d / f"{ff}.py"
+            pyfile.write_text(noop_base.format(ff))
+            paths.append(pyfile)
+
+        args = ["--code", ",".join([str(pp) for pp in paths])]
+        yield args
+
+
+@pytest.fixture
+def noop_config():
+    with make_tempdir() as temp_d:
+        cfg = temp_d / "config.cfg"
+        cfg.write_text(NOOP_CONFIG)
+
+        yield cfg
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "cmd",
+    ["debug config", "debug data", "train", "assemble"],
+)
+def test_multi_code(cmd, code_paths, data_paths, noop_config):
+    # check that it fails without the code arg
+    cmd = cmd.split()
+    output = ["."] if cmd[0] == "assemble" else []
+    cmd = [sys.executable, "-m", "spacy"] + cmd
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+
+@pytest.mark.slow
+def test_multi_code_evaluate(code_paths, data_paths, noop_config):
+    # Evaluation requires a model, not a config, so this works differently from
+    # the other commands.
+
+    # Train a model to evaluate
+    cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split()
+    result = subprocess.run([*cmd, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+    # now do the evaluation
+
+    eval_data = data_paths[-1]
+    cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split()
+
+    # check that it fails without the code arg
+    result = subprocess.run(cmd)
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, *code_paths])
+    assert result.returncode == 0
+
+
 def test_benchmark_accuracy_alias():
     # Verify that the `evaluate` alias works correctly.
     result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 3f91e1ff71e..765bcb8c675 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -176,15 +176,15 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
-| Name                   | Description                                                                                                                                                                          |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                            |
-| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                   |
-| `--code`, `-c`         | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                    |
-| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                        |
-| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                   |
+| Name                   | Description                                                                                                                                                                                            |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                                              |
+| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                                     |
+| `--code`, `-c`         | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                      |
+| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                                          |
+| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                                     |
 
 ### init fill-curated-transformer {id="init-fill-curated-transformer",version="3.7",tag="command"}
 
@@ -266,7 +266,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                                               |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~                                                                                                                                                |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -491,7 +491,7 @@ File       /path/to/thinc/thinc/schedules.py (line 91)
 | Name                     | Description                                                                                                                                                                                                                    |
 | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~             |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                         |
 | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
 | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
@@ -676,7 +676,7 @@ will not be available.
 | Name                       | Description                                                                                                                                                                                                        |
 | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`              | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `--code`, `-c`             | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`             | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~                                                                                                                                                       |
 | `--verbose`, `-V`          | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                     |
 | `--no-format`, `-NF`       | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~                                                                                                                           |
@@ -1136,7 +1136,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -1206,6 +1206,7 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
+| `--code`, `-c`                                     | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
 | `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
@@ -1243,20 +1244,19 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component] [--spans-key]
 ```
 
-| Name                                                 | Description                                                                                                                                                                          |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
-| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
-| `--code`, `-c` <Tag variant="new">3</Tag>            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
-| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
-| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                           |
-| `--spans-key`, `-sk` <Tag variant="new">3.6.2</Tag>  | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~                                                                                                     |
-| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                            |
+| Name                                                 | Description                                                                                                                                                                                            |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                                              |
+| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                                    |
+| `--code`, `-c` <Tag variant="new">3</Tag>            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                                             |
+| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~                              |
+| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                                             |
+| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                                              |
 
 ### speed {id="benchmark-speed", version="3.5", tag="command"}
 
@@ -1302,19 +1302,19 @@ If you want to evaluate the pipeline on raw text only, make sure that the .spacy
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
 
-| Name                      | Description                                                                                                                                                                          |
-| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
-| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
-| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
-| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
-| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
-| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
-| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
-| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
+| Name                      | Description                                                                                                                                                                                            |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                    |
+| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                                   |
+| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                                             |
+| `--code`, `-c`            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                                              |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                                      |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                                    |
+| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                           |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                                               |
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
@@ -1341,19 +1341,19 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                    | Description                                                                                                                                                                          |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| Name                     | Description                                                                                                                                                                                            |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                                  |
+| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                                           |
+| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                                          |
+| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                                              |
+| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                                     |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                                       |
+| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
 
 ## assemble {#assemble tag="command"}
 
@@ -1377,7 +1377,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `config_path`     | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`      | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                   |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~                                                |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~        |
 | `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~                                                                                                                                                |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                    |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~                            |

From c5b1f03725236f6b93db9be724990d2714270f0e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 9 Aug 2023 10:55:52 +0200
Subject: [PATCH 196/504] Switch zh tokenizer default pkuseg_model to
 spacy_ontonotes (#12896)

So that users can use `copy_from_base_model` for other segmenters
without having to override an irrelevant `pkuseg_model` setting, switch
the default `pkuseg_model` to `spacy_ontonotes`.
---
 spacy/lang/zh/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index f7bb092771c..6b980b52b61 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -31,7 +31,7 @@
 [initialize]
 
 [initialize.tokenizer]
-pkuseg_model = null
+pkuseg_model = "spacy_ontonotes"
 pkuseg_user_dict = "default"
 """
 

From 532f6fb9ca5a6d2f2ec5865e394e2d66e169a98c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 14:38:05 +0100
Subject: [PATCH 197/504] Revert "Reimplement distillation with oracle cut size
 (#12214)"

This reverts commit e27c60a70263f7ab17968964de37e938653e37a2.
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 105 ++++++---------------------
 spacy/tests/parser/test_model.py     |  61 ----------------
 spacy/tests/parser/test_ner.py       |   5 +-
 spacy/tests/parser/test_parse.py     |   5 +-
 5 files changed, 24 insertions(+), 156 deletions(-)
 delete mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 6c5c29d8549..e497643f0cd 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -267,11 +267,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1 and (actions is None or len(actions) > 0):
+    while sizes.states >= 1:
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
-        assert step_actions is None or step_actions.size == sizes.states, \
-            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9fa0d4987b8..99970b3fe93 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -49,10 +49,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 
 
 
-# TODO: Remove when we switch to Cython 3.
-cdef extern from "<algorithm>" namespace "std" nogil:
-    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
-
 NUMPY_OPS = NumpyOps()
 
 
@@ -271,8 +267,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -283,14 +279,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs,
-                                               states=[state.copy() for state in states],
-                                               moves=self.moves,
-                                               max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_diff_to_actions(states, student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-                                               states=states, moves=teacher_pipe.moves, actions=actions)
+            moves=self.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -538,7 +532,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_to_actions(student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -658,7 +652,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
+    def _init_batch(self, teacher_step_model, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -667,12 +661,10 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
+            StateClass start_state
             StateClass state
-            TransitionSystem moves = teacher_pipe.moves
-
-        # Start with the same heuristic as in supervised training: exclude
-        # docs that are within the maximum length.
-        all_states = moves.init_batch(docs)
+            Transition action
+        all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -681,30 +673,19 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
-
-        if not to_cut:
-            return states
-
-        # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs,
-                                               moves=moves,
-                                               states=[state.copy() for state in to_cut])
-        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
-
-        # Step through the teacher's actions and store every state after
-        # each multiple of max_length.
-        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            for step_actions in teacher_actions[:max_length]:
-                to_cut = moves.apply_actions(to_cut, step_actions)
-            teacher_actions = teacher_actions[max_length:]
-
-            if len(teacher_actions) < max_length:
-                break
-
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
         return states
 
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long doc will get multiple states. Let's say we
@@ -765,7 +746,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -786,47 +767,3 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
-
-
-def _states_diff_to_actions(
-    before_states: List[StateClass],
-    after_states: List[StateClass]
-) -> List[Ints1d]:
-    """
-    Return for two sets of states the actions to go from the first set of
-    states to the second set of states. The histories of the first set of
-    states must be a prefix of the second set of states.
-    """
-    cdef StateClass before_state, after_state
-    cdef StateC* c_state_before
-    cdef StateC* c_state_after
-
-    assert len(before_states) == len(after_states)
-
-    # Check invariant: before states histories must be prefixes of after states.
-    for before_state, after_state in zip(before_states, after_states):
-        c_state_before = before_state.c
-        c_state_after = after_state.c
-
-        assert equal(c_state_before.history.begin(),
-                     c_state_before.history.end(),
-                     c_state_after.history.begin())
-
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for before_state, after_state in zip(before_states, after_states):
-            c_state_before = before_state.c
-            c_state_after = after_state.c
-            if step < c_state_after.history.size() - c_state_before.history.size():
-                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
deleted file mode 100644
index 8c1cf7a9346..00000000000
--- a/spacy/tests/parser/test_model.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import pytest
-
-from spacy.lang.en import English
-from spacy.ml.tb_framework import TransitionModelInputs
-from spacy.training import Example
-
-TRAIN_DATA = [
-    (
-        "They trade mortgage-backed securities.",
-        {
-            "heads": [1, 1, 4, 4, 5, 1, 1],
-            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
-        },
-    ),
-    (
-        "I like London and Berlin.",
-        {
-            "heads": [1, 1, 1, 2, 2, 1],
-            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
-        },
-    ),
-]
-
-
-@pytest.fixture
-def nlp_parser():
-    nlp = English()
-    parser = nlp.add_pipe("parser")
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for dep in annotations["deps"]:
-            parser.add_label(dep)
-    nlp.initialize()
-
-    return nlp, parser
-
-
-def test_incorrect_number_of_actions(nlp_parser):
-    nlp, parser = nlp_parser
-    doc = nlp.make_doc("test")
-
-    # Too many actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
-            )
-        )
-
-    # Too few actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc, doc],
-                moves=parser.moves,
-                actions=[numpy.array([0], dtype="i")],
-            )
-        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index f0efc3a63fd..bb9b7653ce3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -623,9 +623,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,7 +641,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 636bb887789..d25eb165acb 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -462,9 +462,7 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -482,7 +480,6 @@ def test_distill(max_moves):
 
     student = English()
     student_parser = student.add_pipe("parser")
-    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 55a61359a07e3def147e6c12d494e376b097e41d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:23:08 +0100
Subject: [PATCH 198/504] Revert "Merge the parser refactor into `v4` (#10940)"

This reverts commit a183db3cefe0713e828868b43daf61c464cae05c.
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/compat.py                               |   5 +
 spacy/errors.py                               |   7 +-
 spacy/ml/_precomputable_affine.py             | 164 +++++
 spacy/ml/models/parser.py                     | 177 +++--
 spacy/ml/parser_model.pxd                     |  49 ++
 spacy/ml/parser_model.pyx                     | 500 ++++++++++++++
 spacy/ml/tb_framework.pxd                     |  28 -
 spacy/ml/tb_framework.py                      |  50 ++
 spacy/ml/tb_framework.pyx                     | 639 ------------------
 .../_parser_internals/_beam_utils.pyx         |   4 +-
 .../_parser_internals/_parser_utils.pxd       |   2 -
 .../_parser_internals/_parser_utils.pyx       |  22 -
 spacy/pipeline/_parser_internals/_state.pxd   |  60 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 -
 spacy/pipeline/_parser_internals/batch.pxd    |   2 -
 spacy/pipeline/_parser_internals/batch.pyx    |  52 --
 spacy/pipeline/_parser_internals/ner.pyx      |   6 +-
 .../pipeline/_parser_internals/stateclass.pyx |   7 -
 .../_parser_internals/transition_system.pxd   |   7 -
 .../_parser_internals/transition_system.pyx   |  73 --
 .../{dep_parser.py => dep_parser.pyx}         |  20 +-
 spacy/pipeline/{ner.py => ner.pyx}            |  35 +-
 spacy/pipeline/transition_parser.pxd          |  21 +
 spacy/pipeline/transition_parser.pyx          | 504 ++++++++------
 spacy/tests/parser/test_ner.py                |  10 +-
 spacy/tests/parser/test_parse.py              |  74 +-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  55 +-
 spacy/training/example.pyx                    |   2 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 36 files changed, 1384 insertions(+), 1312 deletions(-)
 create mode 100644 spacy/ml/_precomputable_affine.py
 create mode 100644 spacy/ml/parser_model.pxd
 create mode 100644 spacy/ml/parser_model.pyx
 delete mode 100644 spacy/ml/tb_framework.pxd
 create mode 100644 spacy/ml/tb_framework.py
 delete mode 100644 spacy/ml/tb_framework.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.py => dep_parser.pyx} (96%)
 rename spacy/pipeline/{ner.py => ner.pyx} (93%)
 create mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index a4a87d68aec..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,12 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.tb_framework",
+    "spacy.ml.parser_model",
     "spacy.morphology",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -43,7 +45,6 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
-    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -51,7 +52,6 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
-    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 36325711d4d..2817147f3e9 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,11 +90,12 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -110,11 +111,12 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -385,11 +387,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -402,11 +405,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/compat.py b/spacy/compat.py
index 1e63807a0e8..30459e2e495 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,6 +23,11 @@
 except ImportError:
     cupy = None
 
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index adca5880283..a5d0b3d11a9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,12 +217,6 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
-    # v4 warning strings
-    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
-    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
-            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
-            "to return `True` in `.supports_prior_probs`.")
-
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -1007,6 +1001,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
+
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
new file mode 100644
index 00000000000..1c20c622b2c
--- /dev/null
+++ b/spacy/ml/_precomputable_affine.py
@@ -0,0 +1,164 @@
+from thinc.api import Model, normal_init
+
+from ..util import registry
+
+
+@registry.layers("spacy.PrecomputableAffine.v1")
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
+    model = Model(
+        "precomputable_affine",
+        forward,
+        init=init,
+        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+        params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout},
+    )
+    return model
+
+
+def forward(model, X, is_train):
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.get_param("W")
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
+    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
+
+    def backward(dY_ids):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nO, nP), and get back:
+        # (nB, nO, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nO, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        assert dY.ndim == 3
+        assert dY.shape[1] == nO, dY.shape
+        assert dY.shape[2] == nP, dY.shape
+        # nB = dY.shape[0]
+        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+        Xf = X[ids]
+        Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+        model.inc_grad("b", dY.sum(axis=0))
+        dY = dY.reshape((dY.shape[0], nO * nP))
+
+        Wopfi = W.transpose((1, 2, 0, 3))
+        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
+        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+        # (o, p, f, i) --> (f, o, p, i)
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
+        model.inc_grad("W", dWopfi)
+        return dXf.reshape((dXf.shape[0], nF, nI))
+
+    return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+    nB = dY.shape[0]
+    nF = model.get_dim("nF")
+    nP = model.get_dim("nP")
+    nO = model.get_dim("nO")
+    # Backprop the "padding", used as a filler for missing values.
+    # Values that are missing are set to -1, and each state vector could
+    # have multiple missing values. The padding has different values for
+    # different missing features. The gradient of the padding vector is:
+    #
+    # for b in range(nB):
+    #     for f in range(nF):
+    #         if ids[b, f] < 0:
+    #             d_pad[f] += dY[b]
+    #
+    # Which can be rewritten as:
+    #
+    # (ids < 0).T @ dY
+    mask = model.ops.asarray(ids < 0, dtype="f")
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
+    return d_pad.reshape((1, nF, nO, nP))
+
+
+def init(model, X=None, Y=None):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    if model.has_param("W") and model.get_param("W").any():
+        return
+
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nO, nP, nI)
+    b = model.ops.alloc2f(nO, nP)
+    pad = model.ops.alloc4f(1, nF, nO, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
+        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors = vectors.reshape((vectors.shape[0], nO, nP))
+        vectors += b
+        vectors = model.ops.asarray(vectors)
+        if nP >= 2:
+            return model.ops.maxout(vectors)[0]
+        else:
+            return vectors * (vectors >= 0)
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = model.get_param("W").copy()
+    b = model.get_param("b").copy()
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("b", b)
+        else:
+            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 422abf4e260..a70d84dea8f 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,66 +1,23 @@
-import warnings
-from typing import Any, List, Literal, Optional, Tuple
-
-from thinc.api import Model
+from typing import Optional, List, cast
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
-from ...errors import Errors, Warnings
-from ...tokens.doc import Doc
+from ...errors import Errors
+from ...compat import Literal
 from ...util import registry
+from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-
-TransitionSystem = Any  # TODO
-State = Any  # TODO
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v2")
-def transition_parser_v2(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    use_upper: bool,
-    nO: Optional[int] = None,
-) -> Model:
-    if not use_upper:
-        warnings.warn(Warnings.W400)
-
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v3")
-def transition_parser_v3(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-) -> Model:
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
+from ...tokens import Doc
 
 
+@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
+    use_upper: bool,
     nO: Optional[int] = None,
 ) -> Model:
     """
@@ -94,7 +51,14 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3.
+        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
+        is replaced with a ReLu non-linearity if use_upper=True, and no
+        non-linearity if use_upper=False.
+    use_upper (bool): Whether to use an additional hidden layer after the state
+        vector in order to predict the action scores. It is recommended to set
+        this to False for large pretrained models such as transformers, and True
+        for smaller networks. The upper layer is computed on CPU, which becomes
+        a bottleneck on larger GPU-based models, where it's also less necessary.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -105,11 +69,106 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    return TransitionModel(
-        tok2vec=tok2vec,
-        state_tokens=nr_feature_tokens,
-        hidden_width=hidden_width,
-        maxout_pieces=maxout_pieces,
-        nO=nO,
-        unseen_classes=set(),
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec = chain(
+        tok2vec,
+        list2array(),
+        Linear(hidden_width, t2v_width),
     )
+    tok2vec.set_dim("nO", hidden_width)
+    lower = _define_lower(
+        nO=hidden_width if use_upper else nO,
+        nF=nr_feature_tokens,
+        nI=tok2vec.get_dim("nO"),
+        nP=maxout_pieces,
+    )
+    upper = None
+    if use_upper:
+        with use_ops("cpu"):
+            # Initialize weights at zero, as it's a classification layer.
+            upper = _define_upper(nO=nO, nI=None)
+    return TransitionModel(tok2vec, lower, upper, resize_output)
+
+
+def _define_upper(nO, nI):
+    return Linear(nO=nO, nI=nI, init_W=zero_init)
+
+
+def _define_lower(nO, nF, nI, nP):
+    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
+
+
+def resize_output(model, new_nO):
+    if model.attrs["has_upper"]:
+        return _resize_upper(model, new_nO)
+    return _resize_lower(model, new_nO)
+
+
+def _resize_upper(model, new_nO):
+    upper = model.get_ref("upper")
+    if upper.has_dim("nO") is None:
+        upper.set_dim("nO", new_nO)
+        return model
+    elif new_nO == upper.get_dim("nO"):
+        return model
+
+    smaller = upper
+    nI = smaller.maybe_get_dim("nI")
+    with use_ops("cpu"):
+        larger = _define_upper(nO=new_nO, nI=nI)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc2f(new_nO, nI)
+        larger_b = larger.ops.alloc1f(new_nO)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        # Weights are stored in (nr_out, nr_in) format, so we're basically
+        # just adding rows here.
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:old_nO] = smaller_W
+            larger_b[:old_nO] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+    model._layers[-1] = larger
+    model.set_ref("upper", larger)
+    return model
+
+
+def _resize_lower(model, new_nO):
+    lower = model.get_ref("lower")
+    if lower.has_dim("nO") is None:
+        lower.set_dim("nO", new_nO)
+        return model
+
+    smaller = lower
+    nI = smaller.maybe_get_dim("nI")
+    nF = smaller.maybe_get_dim("nF")
+    nP = smaller.maybe_get_dim("nP")
+    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
+        larger_b = larger.ops.alloc2f(new_nO, nP)
+        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        smaller_pad = smaller.get_param("pad")
+        # Copy the old weights and padding into the new layer
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:, 0:old_nO, :, :] = smaller_W
+            larger_pad[:, :, 0:old_nO, :] = smaller_pad
+            larger_b[0:old_nO, :] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+        larger.set_param("pad", larger_pad)
+    model._layers[1] = larger
+    model.set_ref("lower", larger)
+    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
new file mode 100644
index 00000000000..8def6cea53f
--- /dev/null
+++ b/spacy/ml/parser_model.pxd
@@ -0,0 +1,49 @@
+from libc.string cimport memset, memcpy
+from thinc.backends.cblas cimport CBlas
+from ..typedefs cimport weight_t, hash_t
+from ..pipeline._parser_internals._state cimport StateC
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const float* seen_classes
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* scores
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
+
+
+cdef WeightsC get_c_weights(model) except *
+
+cdef SizesC get_c_sizes(model, int batch_size) except *
+
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil
+ 
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores, int O) nogil
+ 
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
new file mode 100644
index 00000000000..91558683b60
--- /dev/null
+++ b/spacy/ml/parser_model.pyx
@@ -0,0 +1,500 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+cimport numpy as np
+from libc.math cimport exp
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from thinc.backends.cblas cimport saxpy, sgemm
+
+import numpy
+import numpy.random
+from thinc.api import Model, CupyOps, NumpyOps, get_ops
+
+from .. import util
+from ..errors import Errors
+from ..typedefs cimport weight_t, class_t, hash_t
+from ..pipeline._parser_internals.stateclass cimport StateClass
+
+
+cdef WeightsC get_c_weights(model) except *:
+    cdef WeightsC output
+    cdef precompute_hiddens state2vec = model.state2vec
+    output.feat_weights = state2vec.get_feat_weights()
+    output.feat_bias = <const float*>state2vec.bias.data
+    cdef np.ndarray vec2scores_W
+    cdef np.ndarray vec2scores_b
+    if model.vec2scores is None:
+        output.hidden_weights = NULL
+        output.hidden_bias = NULL
+    else:
+        vec2scores_W = model.vec2scores.get_param("W")
+        vec2scores_b = model.vec2scores.get_param("b")
+        output.hidden_weights = <const float*>vec2scores_W.data
+        output.hidden_bias = <const float*>vec2scores_b.data
+    cdef np.ndarray class_mask = model._class_mask
+    output.seen_classes = <const float*>class_mask.data
+    return output
+
+
+cdef SizesC get_c_sizes(model, int batch_size) except *:
+    cdef SizesC output
+    output.states = batch_size
+    if model.vec2scores is None:
+        output.classes = model.state2vec.get_dim("nO")
+    else:
+        output.classes = model.vec2scores.get_dim("nO")
+    output.hiddens = model.state2vec.get_dim("nO")
+    output.pieces = model.state2vec.get_dim("nP")
+    output.feats = model.state2vec.get_dim("nF")
+    output.embed_width = model.tokvecs.shape[1]
+    return output
+
+
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.scores = <float*>realloc(A.scores,
+            n.states * n.classes * sizeof(A.scores[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil:
+    cdef double one = 1.0
+    resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
+    sum_state_features(cblas, A.unmaxed,
+        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = _arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    memset(A.scores, 0, n.states * n.classes * sizeof(float))
+    if W.hidden_weights == NULL:
+        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+            1.0, <const float *>A.hiddens, n.hiddens,
+            <const float *>W.hidden_weights, n.hiddens,
+            0.0, A.scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = A.scores[0]
+    for i in range(1, n.states * n.classes):
+        if A.scores[i] < min_:
+            min_ = A.scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if not W.seen_classes[j]:
+                A.scores[i*n.classes+j] = min_
+
+
+cdef void sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    padding = cached
+    cached += F * O
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores,
+        int O) nogil:
+    """Do multi-label log loss"""
+    cdef double max_, gmax, Z, gZ
+    best = arg_max_if_gold(scores, costs, is_valid, O)
+    guess = _arg_max(scores, O)
+
+    if best == -1 or guess == -1:
+        # These shouldn't happen, but if they do, we want to make sure we don't
+        # cause an OOB access.
+        return
+    Z = 1e-10
+    gZ = 1e-10
+    max_ = scores[guess]
+    gmax = scores[best]
+    for i in range(O):
+        Z += exp(scores[i] - max_)
+        if costs[i] <= costs[best]:
+            gZ += exp(scores[i] - gmax)
+    for i in range(O):
+        if costs[i] <= costs[best]:
+            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
+        else:
+            d_scores[i] = exp(scores[i]-max_) / Z
+
+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
+        const int* is_valid, int n) nogil:
+    # Find minimum cost
+    cdef float cost = 1
+    for i in range(n):
+        if is_valid[i] and costs[i] < cost:
+            cost = costs[i]
+    # Now find best-scoring with that cost
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= cost and is_valid[i]:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+
+class ParserStepModel(Model):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+            dropout=0.1):
+        Model.__init__(self, name="parser_step_model", forward=step_forward)
+        self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
+        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
+        if layers[1].get_dim("nP") >= 2:
+            activation = "maxout"
+        elif has_upper:
+            activation = None
+        else:
+            activation = "relu"
+        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+                                            activation=activation, train=train)
+        if has_upper:
+            self.vec2scores = layers[-1]
+        else:
+            self.vec2scores = None
+        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
+        self.backprops = []
+        self._class_mask = numpy.zeros((self.nO,), dtype='f')
+        self._class_mask.fill(1)
+        if unseen_classes is not None:
+            for class_ in unseen_classes:
+                self._class_mask[class_] = 0.
+
+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
+    @property
+    def nO(self):
+        if self.attrs["has_upper"]:
+            return self.vec2scores.get_dim("nO")
+        else:
+            return self.state2vec.get_dim("nO")
+
+    def class_is_unseen(self, class_):
+        return self._class_mask[class_]
+
+    def mark_class_unseen(self, class_):
+        self._class_mask[class_] = 0
+
+    def mark_class_seen(self, class_):
+        self._class_mask[class_] = 1
+
+    def get_token_ids(self, states):
+        cdef StateClass state
+        states = [state for state in states if not state.is_final()]
+        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
+                                          dtype='i', order='C')
+        ids.fill(-1)
+        c_ids = <int*>ids.data
+        for state in states:
+            state.c.set_context_tokens(c_ids, ids.shape[1])
+            c_ids += ids.shape[1]
+        return ids
+
+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
+    def finish_steps(self, golds):
+        # Add a padding vector to the d_tokvecs gradient, so that missing
+        # values don't affect the real gradient.
+        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
+        for ids, d_vector, bp_vector in self.backprops:
+            d_state_features = bp_vector((d_vector, ids))
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.ops.scatter_add(d_tokvecs, ids,
+                d_state_features)
+        # Padded -- see update()
+        self.bp_tokvecs(d_tokvecs[:-1])
+        return d_tokvecs
+
+NUMPY_OPS = NumpyOps()
+
+def step_forward(model: ParserStepModel, states, is_train):
+    token_ids = model.get_token_ids(states)
+    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
+    if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            vector *= mask
+        scores, get_d_vector = model.vec2scores(vector, is_train)
+    else:
+        scores = NumpyOps().asarray(vector)
+        get_d_vector = lambda d_scores: d_scores
+    # If the class is unseen, make sure its score is minimum
+    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
+
+    def backprop_parser_step(d_scores):
+        # Zero vectors for unseen classes
+        d_scores *= model._class_mask
+        d_vector = get_d_vector(d_scores)
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
+        return None
+    return scores, backprop_parser_step
+
+
+cdef class precompute_hiddens:
+    """Allow a model to be "primed" by pre-computing input features in bulk.
+
+    This is used for the parser, where we want to take a batch of documents,
+    and compute vectors for each (token, position) pair. These vectors can then
+    be reused, especially for beam-search.
+
+    Let's say we're using 12 features for each state, e.g. word at start of
+    buffer, three words on stack, their children, etc. In the normal arc-eager
+    system, a document of length N is processed in 2*N states. This means we'll
+    create 2*N*12 feature vectors --- but if we pre-compute, we only need
+    N*12 vector computations. The saving for beam-search is much better:
+    if we have a beam of k, we'll normally make 2*N*12*K computations --
+    so we can save the factor k. This also gives a nice CPU/GPU division:
+    we can do all our hard maths up front, packed into large multiplications,
+    and do the hard-to-program parsing on the CPU.
+    """
+    cdef readonly int nF, nO, nP
+    cdef bint _is_synchronized
+    cdef public object ops
+    cdef public object numpy_ops
+    cdef public object _cpu_ops
+    cdef np.ndarray _features
+    cdef np.ndarray _cached
+    cdef np.ndarray bias
+    cdef object _cuda_stream
+    cdef object _bp_hiddens
+    cdef object activation
+
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 activation="maxout", train=False):
+        gpu_cached, bp_features = lower_model(tokvecs, train)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
+            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
+        else:
+            self.bias = lower_model.get_param("b")
+        self.nF = cached.shape[1]
+        if lower_model.has_dim("nP"):
+            self.nP = lower_model.get_dim("nP")
+        else:
+            self.nP = 1
+        self.nO = cached.shape[2]
+        self.ops = lower_model.ops
+        self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
+        assert activation in (None, "relu", "maxout")
+        self.activation = activation
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    cdef const float* get_feat_weights(self) except NULL:
+        if not self._is_synchronized and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        return <float*>self._cached.data
+
+    def has_dim(self, name):
+        if name == "nF":
+            return self.nF if self.nF is not None else True
+        elif name == "nP":
+            return self.nP if self.nP is not None else True
+        elif name == "nO":
+            return self.nO if self.nO is not None else True
+        else:
+            return False
+
+    def get_dim(self, name):
+        if name == "nF":
+            return self.nF
+        elif name == "nP":
+            return self.nP
+        elif name == "nO":
+            return self.nO
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def set_dim(self, name, value):
+        if name == "nF":
+            self.nF = value
+        elif name == "nP":
+            self.nP = value
+        elif name == "nO":
+            self.nO = value
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def __call__(self, X, bint is_train):
+        if is_train:
+            return self.begin_update(X)
+        else:
+            return self.predict(X), lambda X: X
+
+    def predict(self, X):
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids):
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        # This is tricky, but (assuming GPU available);
+        # - Input to forward on CPU
+        # - Output from forward on CPU
+        # - Input to backward on GPU!
+        # - Output from backward on GPU
+        bp_hiddens = self._bp_hiddens
+
+        cdef CBlas cblas = self._cpu_ops.cblas()
+
+        feat_weights = self.get_feat_weights()
+        cdef int[:, ::1] ids = token_ids
+        sum_state_features(cblas, <float*>state_vector.data,
+            feat_weights, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
+
+        def backward(d_state_vector_ids):
+            d_state_vector, token_ids = d_state_vector_ids
+            d_state_vector = bp_nonlinearity(d_state_vector)
+            d_tokens = bp_hiddens((d_state_vector, token_ids))
+            return d_tokens
+        return state_vector, backward
+
+    def _nonlinearity(self, state_vector):
+        if self.activation == "maxout":
+            return self._maxout_nonlinearity(state_vector)
+        else:
+            return self._relu_nonlinearity(state_vector)
+
+    def _maxout_nonlinearity(self, state_vector):
+        state_vector, mask = self.numpy_ops.maxout(state_vector)
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_maxout(d_best):
+            return self.ops.backprop_maxout(d_best, mask, self.nP)
+        
+        return state_vector, backprop_maxout
+
+    def _relu_nonlinearity(self, state_vector):
+        state_vector = state_vector.reshape((state_vector.shape[0], -1))
+        mask = state_vector >= 0.
+        state_vector *= mask
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_relu(d_best):
+            d_best *= mask
+            return d_best.reshape((d_best.shape + (1,)))
+ 
+        return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
deleted file mode 100644
index 965508519e8..00000000000
--- a/spacy/ml/tb_framework.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from libc.stdint cimport int8_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-    int tokens
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const int8_t* seen_mask
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 00000000000..ab4a969e24e
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,50 @@
+from thinc.api import Model, noop
+from .parser_model import ParserStepModel
+from ..util import registry
+
+
+@registry.layers("spacy.TransitionModel.v1")
+def TransitionModel(
+    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
+):
+    """Set up a stepwise transition-based model"""
+    if upper is None:
+        has_upper = False
+        upper = noop()
+    else:
+        has_upper = True
+    # don't define nO for this object, because we can't dynamically change it
+    return Model(
+        name="parser_model",
+        forward=forward,
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
+        layers=[tok2vec, lower, upper],
+        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+        init=init,
+        attrs={
+            "has_upper": has_upper,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def forward(model, X, is_train):
+    step_model = ParserStepModel(
+        X,
+        model.layers,
+        unseen_classes=model.attrs["unseen_classes"],
+        train=is_train,
+        has_upper=model.attrs["has_upper"],
+    )
+
+    return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+    model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower")
+    lower.initialize()
+    if model.attrs["has_upper"]:
+        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
deleted file mode 100644
index e497643f0cd..00000000000
--- a/spacy/ml/tb_framework.pyx
+++ /dev/null
@@ -1,639 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, cast
-
-from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
-import numpy
-
-cimport numpy as np
-
-from thinc.api import (
-    Linear,
-    Model,
-    NumpyOps,
-    chain,
-    glorot_uniform_init,
-    list2array,
-    normal_init,
-    uniform_init,
-    zero_init,
-)
-
-from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-
-from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
-
-from ..errors import Errors
-from ..pipeline._parser_internals import _beam_utils
-from ..pipeline._parser_internals.batch import GreedyBatch
-
-from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
-from ..pipeline._parser_internals.transition_system cimport (
-    TransitionSystem,
-    c_apply_actions,
-    c_transition_batch,
-)
-
-from ..tokens.doc import Doc
-from ..util import registry
-
-State = Any  # TODO
-
-
-@registry.layers("spacy.TransitionModel.v2")
-def TransitionModel(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    beam_width: int = 1,
-    beam_density: float = 0.0,
-    state_tokens: int,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-    unseen_classes=set(),
-) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
-    """Set up a transition-based parsing model, using a maxout hidden
-    layer and a linear output layer.
-    """
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
-    tok2vec_projected.set_dim("nO", hidden_width)
-
-    # FIXME: we use `output` as a container for the output layer's
-    # weights and biases. Thinc optimizers cannot handle resizing
-    # of parameters. So, when the parser model is resized, we
-    # construct a new `output` layer, which has a different key in
-    # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `output` layer by `output_W` and `output_b`
-    # parameters in this model.
-    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
-
-    return Model(
-        name="parser_model",
-        forward=forward,
-        init=init,
-        layers=[tok2vec_projected, output],
-        refs={
-            "tok2vec": tok2vec_projected,
-            "output": output,
-        },
-        params={
-            "hidden_W": None,  # Floats2d W for the hidden layer
-            "hidden_b": None,  # Floats1d bias for the hidden layer
-            "hidden_pad": None,  # Floats1d padding for the hidden layer
-        },
-        dims={
-            "nO": None,  # Output size
-            "nP": maxout_pieces,
-            "nH": hidden_width,
-            "nI": tok2vec_projected.maybe_get_dim("nO"),
-            "nF": state_tokens,
-        },
-        attrs={
-            "beam_width": beam_width,
-            "beam_density": beam_density,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def resize_output(model: Model, new_nO: int) -> Model:
-    old_nO = model.maybe_get_dim("nO")
-    output = model.get_ref("output")
-    if old_nO is None:
-        model.set_dim("nO", new_nO)
-        output.set_dim("nO", new_nO)
-        output.initialize()
-        return model
-    elif new_nO <= old_nO:
-        return model
-    elif output.has_param("W"):
-        nH = model.get_dim("nH")
-        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_output.initialize()
-        new_W = new_output.get_param("W")
-        new_b = new_output.get_param("b")
-        old_W = output.get_param("W")
-        old_b = output.get_param("b")
-        new_W[:old_nO] = old_W  # type: ignore
-        new_b[:old_nO] = old_b  # type: ignore
-        for i in range(old_nO, new_nO):
-            model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_output
-        model.set_ref("output", new_output)
-    # TODO: Avoid this private intrusion
-    model._dims["nO"] = new_nO
-    return model
-
-
-def init(
-    model,
-    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
-    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
-):
-    if X is not None:
-        docs, _ = X
-        model.get_ref("tok2vec").initialize(X=docs)
-    else:
-        model.get_ref("tok2vec").initialize()
-    inferred_nO = _infer_nO(Y)
-    if inferred_nO is not None:
-        current_nO = model.maybe_get_dim("nO")
-        if current_nO is None or current_nO != inferred_nO:
-            model.attrs["resize_output"](model, inferred_nO)
-    nP = model.get_dim("nP")
-    nH = model.get_dim("nH")
-    nI = model.get_dim("nI")
-    nF = model.get_dim("nF")
-    ops = model.ops
-
-    Wl = ops.alloc2f(nH * nP, nF * nI)
-    bl = ops.alloc1f(nH * nP)
-    padl = ops.alloc1f(nI)
-    # Wl = zero_init(ops, Wl.shape)
-    Wl = glorot_uniform_init(ops, Wl.shape)
-    padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize output_W
-    model.set_param("hidden_W", Wl)
-    model.set_param("hidden_b", bl)
-    model.set_param("hidden_pad", padl)
-    # model = _lsuv_init(model)
-    return model
-
-
-class TransitionModelInputs:
-    """
-    Input to transition model.
-    """
-
-    # dataclass annotation is not yet supported in Cython 0.29.x,
-    # so, we'll do something close to it.
-
-    actions: Optional[List[Ints1d]]
-    docs: List[Doc]
-    max_moves: int
-    moves: TransitionSystem
-    states: Optional[List[State]]
-
-    __slots__ = [
-        "actions",
-        "docs",
-        "max_moves",
-        "moves",
-        "states",
-    ]
-
-    def __init__(
-        self,
-        docs: List[Doc],
-        moves: TransitionSystem,
-        actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0,
-        states: Optional[List[State]] = None,
-    ):
-        """
-        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
-        docs (List[Doc]): Docs to predict transition sequences for.
-        max_moves: (int): the maximum number of moves to apply, values less
-            than 1 will apply moves to states until they are final states.
-        moves (TransitionSystem): the transition system to use when predicting
-            the transition sequences.
-        states (Optional[List[States]]): the initial states to predict the
-            transition sequences for. When absent, the initial states are
-            initialized from the provided Docs.
-        """
-        self.actions = actions
-        self.docs = docs
-        self.moves = moves
-        self.max_moves = max_moves
-        self.states = states
-
-
-def forward(model, inputs: TransitionModelInputs, is_train: bool):
-    docs = inputs.docs
-    moves = inputs.moves
-    actions = inputs.actions
-
-    beam_width = model.attrs["beam_width"]
-    hidden_pad = model.get_param("hidden_pad")
-    tok2vec = model.get_ref("tok2vec")
-
-    states = moves.init_batch(docs) if inputs.states is None else inputs.states
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    seen_mask = _get_seen_mask(model)
-
-    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
-        # Note: max_moves is only used during training, so we don't need to
-        #       pass it to the greedy inference path.
-        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
-    else:
-        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
-                                 max_moves=inputs.max_moves)
-
-
-def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
-    cdef vector[StateC*] c_states
-    cdef StateClass state
-    for state in states:
-        if not state.is_final():
-            c_states.push_back(state.c)
-    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
-    # Precomputed features have rows for each token, plus one for padding.
-    cdef int n_tokens = feats.shape[0] - 1
-    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
-    cdef CBlas cblas = model.ops.cblas()
-    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
-
-    def backprop(dY):
-        raise ValueError(Errors.E4004)
-
-    return (states, scores), backprop
-
-
-cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
-                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i
-    cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = _alloc_activations(sizes)
-    cdef np.ndarray step_scores
-    cdef np.ndarray step_actions
-
-    scores = []
-    while sizes.states >= 1:
-        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
-        step_actions = actions[0] if actions is not None else None
-        with nogil:
-            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
-            if actions is None:
-                # Validate actions, argmax, take action.
-                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                                   sizes.states)
-            else:
-                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-        sizes.states = unfinished.size()
-        scores.append(step_scores)
-        unfinished.clear()
-        actions = actions[1:] if actions is not None else None
-    _free_activations(&activations)
-
-    return scores
-
-
-def _forward_fallback(
-    model: Model,
-    moves: TransitionSystem,
-    states: List[StateClass],
-    tokvecs, backprop_tok2vec,
-    feats,
-    backprop_feats,
-    seen_mask,
-    is_train: bool,
-    actions: Optional[List[Ints1d]] = None,
-    max_moves: int = 0,
-):
-    nF = model.get_dim("nF")
-    output = model.get_ref("output")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-
-    beam_width = model.attrs["beam_width"]
-    beam_density = model.attrs["beam_density"]
-
-    ops = model.ops
-
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    if beam_width == 1:
-        batch = GreedyBatch(moves, states, None)
-    else:
-        batch = _beam_utils.BeamBatch(
-            moves, states, None, width=beam_width, density=beam_density
-        )
-    arange = ops.xp.arange(nF)
-    n_moves = 0
-    while not batch.is_done:
-        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
-        for i, state in enumerate(batch.get_unfinished_states()):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += hidden_b
-        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        cpu_scores = ops.to_numpy(scores)
-        if actions is None:
-            batch.advance(cpu_scores)
-        else:
-            batch.advance_with_actions(actions[0])
-            actions = actions[1:]
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_ids.append(ids)
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-        if n_moves >= max_moves >= 1:
-            break
-        n_moves += 1
-
-    def backprop_parser(d_states_d_scores):
-        ids = ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = ops.xp.vstack(all_statevecs)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False  # no-cython-lint
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO) @ (nS, nH).T
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        # We don't need to backprop the summation, because we pass back the IDs instead
-        d_state_features = backprop_feats((d_preacts2f, ids))
-        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (list(batch), all_scores), backprop_parser
-
-
-def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
-    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
-    for class_ in model.attrs.get("unseen_classes", set()):
-        mask[class_] = True
-    return mask
-
-
-def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("hidden_W")
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
-    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
-    W3f = W3f.transpose((1, 0, 2))
-    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
-    assert X.shape == (X.shape[0], nI), X.shape
-    Yf_ = model.ops.gemm(X, W2f, trans2=True)
-    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
-
-    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nH, nP), and get back:
-        # (nB, nH, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nH, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        dXf = model.ops.gemm(dY, W)
-        Xf = X[ids].reshape((ids.shape[0], -1))
-        dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("hidden_W", dW)
-        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
-
-    return Yf, backward
-
-
-def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
-    if Y is None:
-        return None
-    _, scores = Y
-    if len(scores) == 0:
-        return None
-    assert scores[0].shape[0] >= 1
-    assert len(scores[0].shape) == 2
-    return scores[0].shape[1]
-
-
-def _lsuv_init(model: Model):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    W = model.maybe_get_param("hidden_W")
-    if W is not None and W.any():
-        return
-
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nH, nP, nI)
-    b = model.ops.alloc2f(nH, nP)
-    pad = model.ops.alloc4f(1, nF, nH, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc_f((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc_f((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
-        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
-        vectors3f += b
-        return model.ops.maxout(vectors3f)[0]
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = cast(Floats4d, model.get_param("hidden_W").copy())
-    b = cast(Floats2d, model.get_param("hidden_b").copy())
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("hidden_W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("hidden_b", b)
-        else:
-            break
-    return model
-
-
-cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    output = model.get_ref("output")
-    cdef np.ndarray hidden_b = model.get_param("hidden_b")
-    cdef np.ndarray output_W = output.get_param("W")
-    cdef np.ndarray output_b = output.get_param("b")
-
-    cdef WeightsC weights
-    weights.feat_weights = feats
-    weights.feat_bias = <const float*>hidden_b.data
-    weights.hidden_weights = <const float *> output_W.data
-    weights.hidden_bias = <const float *> output_b.data
-    weights.seen_mask = <const int8_t*> seen_mask.data
-
-    return weights
-
-
-cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC sizes
-    sizes.states = batch_size
-    sizes.classes = model.get_dim("nO")
-    sizes.hiddens = model.get_dim("nH")
-    sizes.pieces = model.get_dim("nP")
-    sizes.feats = model.get_dim("nF")
-    sizes.embed_width = model.get_dim("nI")
-    sizes.tokens = tokens
-    return sizes
-
-
-cdef ActivationsC _alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    _resize_activations(&A, n)
-    return A
-
-
-cdef void _free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    _resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    if W.hidden_weights == NULL:
-        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = scores[0]
-    for i in range(1, n.states * n.classes):
-        if scores[i] < min_:
-            min_ = scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if W.seen_mask[j]:
-                scores[i*n.classes+j] = min_
-
-
-cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces  # no-cython-lint
-    cdef int F = n.feats
-    cdef int T = n.tokens
-    padding = cached + (T * F * O)
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 7c546752d80..273cc6c1078 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -8,8 +8,6 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
-
-from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 
 from .search import MaxViolation
@@ -31,7 +29,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(Batch):
+cdef class BeamBatch(object):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
deleted file mode 100644
index 7fee05bad60..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef int arg_max(const float* scores, const int n_classes) nogil
-cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
deleted file mode 100644
index 582756bf5be..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pyx
+++ /dev/null
@@ -1,22 +0,0 @@
-# cython: infer_types=True
-
-cdef inline int arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
-
-
-cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1c61ac271d8..04274ce8af1 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -7,6 +7,8 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -26,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    vector[int] _heads
+    int* _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,34 +36,31 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
-    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil except +:
-        this._heads.resize(length, -1)
-        this._unshiftable.resize(length, False)
-
-        # Reserve memory ahead of time to minimize allocations during parsing.
-        # The initial capacity set here ideally reflects the expected average-case/majority usage.
-        cdef int init_capacity = 32
-        this._stack.reserve(init_capacity)
-        this._rebuffer.reserve(init_capacity)
-        this._ents.reserve(init_capacity)
-        this._left_arcs.reserve(init_capacity)
-        this._right_arcs.reserve(init_capacity)
-        this.history.reserve(init_capacity)
-
+    __init__(const TokenC* sent, int length) nogil:
         this._sent = sent
+        this._heads = <int*>calloc(length, sizeof(int))
+        if not (this._sent and this._heads):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
+        for i in range(length):
+            this._heads[i] = -1
+            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
+    __dealloc__():
+        free(this._heads)
+
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -134,20 +133,19 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        cdef int stack_size = this._stack.size()
-        if i >= stack_size or i < 0:
+        if i >= this._stack.size():
             return -1
-        else:
-            return this._stack[stack_size - (i+1)]
+        elif i < 0:
+            return -1
+        return this._stack.at(this._stack.size() - (i+1))
 
     int B(int i) nogil const:
-        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < buf_size:
-            return this._rebuffer[buf_size - (i+1)]
+        elif i < this._rebuffer.size():
+            return this._rebuffer.at(this._rebuffer.size() - (i+1))
         else:
-            b_i = this._b_i + (i - buf_size)
+            b_i = this._b_i + (i - this._rebuffer.size())
             if b_i >= this.length:
                 return -1
             else:
@@ -246,7 +244,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
+        elif this._sent_starts.count(word) >= 1:
             return 1
         else:
             return 0
@@ -330,7 +328,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable[item]
+            return this._unshiftable.at(item)
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -350,9 +348,6 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        cdef ArcC* arc
-
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -361,12 +356,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = &arcs.back()
+        arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = &deref(arcs)[i]
+                arc = arcs.at(i)
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -406,11 +401,10 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        this._heads = src._heads
+        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
-        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 08f60b2634b..6ffceae10d3 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,8 +779,6 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -865,7 +863,6 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
deleted file mode 100644
index 60734e549aa..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef class Batch:
-    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
deleted file mode 100644
index 91073b52e68..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any
-
-TransitionSystem = Any  # TODO
-
-cdef class Batch:
-    def advance(self, scores):
-        raise NotImplementedError
-
-    def get_states(self):
-        raise NotImplementedError
-
-    @property
-    def is_done(self):
-        raise NotImplementedError
-
-    def get_unfinished_states(self):
-        raise NotImplementedError
-
-    def __getitem__(self, i):
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-
-class GreedyBatch(Batch):
-    def __init__(self, moves: TransitionSystem, states, golds):
-        self._moves = moves
-        self._states = states
-        self._next_states = [s for s in states if not s.is_final()]
-
-    def advance(self, scores):
-        self._next_states = self._moves.transition_states(self._next_states, scores)
-
-    def advance_with_actions(self, actions):
-        self._next_states = self._moves.apply_actions(self._next_states, actions)
-
-    def get_states(self):
-        return self._states
-
-    @property
-    def is_done(self):
-        return all(s.is_final() for s in self._states)
-
-    def get_unfinished_states(self):
-        return [st for st in self._states if not st.is_final()]
-
-    def __getitem__(self, i):
-        return self._states[i]
-
-    def __len__(self):
-        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 3a352f51ff5..0b9980ddbf2 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -159,7 +159,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+    
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -309,8 +309,6 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -646,7 +644,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
-
+ 
 
 
 cdef class Out:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index c2a0d22956a..f25408a13ba 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,10 +21,6 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
-    @property
-    def history(self):
-        return list(self.c.history)
-
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -181,6 +177,3 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
-
-    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
-        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 08baed932ba..04cd10d8864 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,10 +57,3 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 50b155bf9bb..485ce7c10bd 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,18 +3,12 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
 
 from collections import Counter
 
 import srsly
 
 from ...structs cimport TokenC
-from ...tokens.doc cimport Doc
-from ...typedefs cimport attr_t, weight_t
-from . cimport _beam_utils
-from ._parser_utils cimport arg_max_if_valid
 from .stateclass cimport StateClass
 
 from ... import util
@@ -79,18 +73,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def follow_history(self, doc, history):
-        cdef int clas
-        cdef StateClass state = StateClass(doc)
-        for clas in history:
-            action = self.c[clas]
-            action.do(state.c, action.label)
-            state.c.history.push_back(clas)
-        return state
-
     def get_oracle_sequence(self, Example example, _debug=False):
-        if not self.has_gold(example):
-            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -102,8 +85,6 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
-        if state.is_final():
-            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -129,7 +110,6 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -157,28 +137,6 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
-        state.c.history.push_back(action.clas)
-
-    def apply_actions(self, states, const int[::1] actions):
-        assert len(states) == actions.shape[0]
-        cdef StateClass state
-        cdef vector[StateC*] c_states
-        c_states.resize(len(states))
-        cdef int i
-        for (i, state) in enumerate(states):
-            c_states[i] = state.c
-        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    def transition_states(self, states, float[:, ::1] scores):
-        assert len(states) == scores.shape[0]
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -291,34 +249,3 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil:
-    cdef int i
-    cdef Transition action
-    cdef StateC* state
-    for i in range(batch_size):
-        state = states[i]
-        action = moves.c[actions[i]]
-        action.do(state, action.label)
-        state.history.push_back(action.clas)
-
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil:
-    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
-    cdef int i, guess
-    cdef Transition action
-    for i in range(batch_size):
-        moves.set_valid(is_valid, states[i])
-        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-        if guess == -1:
-            # This shouldn't happen, but it's hard to raise an error here,
-            # and we don't want to infinite loop. So, force to end state.
-            states[i].force_final()
-        else:
-            action = moves.c[guess]
-            action.do(states[i], action.label)
-            states[i].history.push_back(guess)
-    free(is_valid)
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.pyx
similarity index 96%
rename from spacy/pipeline/dep_parser.py
rename to spacy/pipeline/dep_parser.pyx
index b4961487b83..3e59deaae4f 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.pyx
@@ -4,6 +4,10 @@
 
 from thinc.api import Config, Model
 
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager
+
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
@@ -17,11 +21,12 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -121,7 +126,6 @@ def make_parser(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,7 +231,6 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
-
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,11 +238,8 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
-
     results = {}
-    results.update(
-        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-    )
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -252,12 +252,11 @@ def make_parser_scorer():
     return parser_score
 
 
-class DependencyParser(Parser):
+cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
-
     TransitionSystem = ArcEager
 
     def __init__(
@@ -277,7 +276,8 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser."""
+        """Create a DependencyParser.
+        """
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.pyx
similarity index 93%
rename from spacy/pipeline/ner.py
rename to spacy/pipeline/ner.pyx
index 1c7cf151385..c73ec5c528a 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.pyx
@@ -10,15 +10,22 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..util import registry
+from ..training import remove_bilu_prefix
+
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -43,12 +50,8 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+
 )
 def make_ner(
     nlp: Language,
@@ -101,7 +104,6 @@ def make_ner(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -115,12 +117,7 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
 def make_beam_ner(
     nlp: Language,
@@ -194,12 +191,11 @@ def make_ner_scorer():
     return ner_score
 
 
-class EntityRecognizer(Parser):
+cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -217,14 +213,15 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer."""
+        """Create an EntityRecognizer.
+        """
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,  # not relevant for NER
+            min_action_freq=1,   # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
new file mode 100644
index 00000000000..f20e69a6e56
--- /dev/null
+++ b/spacy/pipeline/transition_parser.pxd
@@ -0,0 +1,21 @@
+from cymem.cymem cimport Pool
+from thinc.backends.cblas cimport CBlas
+
+from ..vocab cimport Vocab
+from .trainable_pipe cimport TrainablePipe
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals._state cimport StateC
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+
+
+cdef class Parser(TrainablePipe):
+    cdef public object _rehearsal_model
+    cdef readonly TransitionSystem moves
+    cdef public object _multitasks
+    cdef object _cpu_ops
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 99970b3fe93..4290420c788 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,16 +1,21 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
 from typing import Dict, Iterable, List, Optional, Tuple
-
-cimport numpy as np
 from cymem.cymem cimport Pool
-
-import contextlib
-import random
+cimport numpy as np
 from itertools import islice
+from libcpp.vector cimport vector
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free
+import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
+import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -24,7 +29,16 @@ from thinc.api import (
 )
 from thinc.types import Floats2d, Ints1d
 
-from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..tokens.doc cimport Doc
+from .trainable_pipe import TrainablePipe
+from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
 from ..typedefs cimport weight_t
@@ -52,7 +66,7 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 NUMPY_OPS = NumpyOps()
 
 
-class Parser(TrainablePipe):
+cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -152,9 +166,8 @@ class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
-        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -261,6 +274,15 @@ class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -268,38 +290,50 @@ class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        # We distill as follows: 1. we first let the student predict transition
-        # sequences (and the corresponding transition probabilities); (2) we
-        # let the teacher follow the student's predicted transition sequences
-        # to obtain the teacher's transition probabilities; (3) we compute the
-        # gradients of the student's transition distributions relative to the
-        # teacher's distributions.
-
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
-        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
 
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-        backprop_scores((student_states, d_scores))
+        backprop_tok2vec(student_docs)
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
         return losses
 
     def get_teacher_student_loss(
-            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-            normalize: bool = False,
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -311,28 +345,10 @@ class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-
-        # We can't easily hook up a softmax layer in the parsing model, since
-        # the get_loss does additional masking. So, we could apply softmax
-        # manually here and use Thinc's cross-entropy loss. But it's a bit
-        # suboptimal, since we can have a lot of states that would result in
-        # many kernel launches. Futhermore the parsing model's backprop expects
-        # a XP array, so we'd have to concat the softmaxes anyway. So, like
-        # the get_loss implementation, we'll compute the loss and gradients
-        # ourselves.
-
-        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-                                                axis=-1, inplace=True)
-        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-                                                axis=-1, inplace=True)
-
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = student_scores - teacher_scores
-        if normalize:
-            d_scores /= d_scores.shape[0]
-        loss = (d_scores**2).sum() / d_scores.size
-
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -355,6 +371,9 @@ class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
+        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
+            deals with a failing batch of documents. The default function just reraises
+            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -375,29 +394,78 @@ class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
-        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states_or_beams, _ = self.model.predict(inputs)
-        return states_or_beams
+        if self.cfg["beam_width"] == 1:
+            return self.greedy_parse(docs, drop=0.0)
+        else:
+            return self.beam_parse(
+                docs,
+                drop=0.0,
+                beam_width=self.cfg["beam_width"],
+                beam_density=self.cfg["beam_density"]
+            )
 
     def greedy_parse(self, docs, drop=0.):
-        self._resize()
+        cdef vector[StateC*] states
+        cdef StateClass state
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=1):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states, _ = self.model.predict(inputs)
-        return states
+        set_dropout_rate(self.model, drop)
+        batch = self.moves.init_batch(docs)
+        model = self.model.predict(docs)
+        weights = get_c_weights(model)
+        for state in batch:
+            if not state.is_final():
+                states.push_back(state.c)
+        sizes = get_c_sizes(model, states.size())
+        with nogil:
+            self._parseC(cblas, &states[0], weights, sizes)
+        model.clear_memory()
+        del model
+        return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
+        cdef Beam beam
+        cdef Doc doc
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            beams, _ = self.model.predict(inputs)
-        return beams
+        batch = _beam_utils.BeamBatch(
+            self.moves,
+            self.moves.init_batch(docs),
+            None,
+            beam_width,
+            density=beam_density
+        )
+        model = self.model.predict(docs)
+        while not batch.is_done:
+            states = batch.get_unfinished_states()
+            if not states:
+                break
+            scores = model.predict(states)
+            batch.advance(scores)
+        model.clear_memory()
+        del model
+        return list(batch)
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil:
+        cdef int i, j
+        cdef vector[StateC*] unfinished
+        cdef ActivationsC activations = alloc_activations(sizes)
+        while sizes.states >= 1:
+            predict_states(cblas, &activations, states, &weights, sizes)
+            # Validate actions, argmax, take action.
+            self.c_transition_batch(states,
+                activations.scores, sizes.classes, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+            sizes.states = unfinished.size()
+            unfinished.clear()
+        free_activations(&activations)
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -408,6 +476,35 @@ class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
+    def transition_states(self, states, float[:, ::1] scores):
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
+        cdef int i, guess
+        cdef Transition action
+        for i in range(batch_size):
+            self.moves.set_valid(is_valid, states[i])
+            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+            if guess == -1:
+                # This shouldn't happen, but it's hard to raise an error here,
+                # and we don't want to infinite loop. So, force to end state.
+                states[i].force_final()
+            else:
+                action = self.moves.c[guess]
+                action.do(states[i], action.label)
+        free(is_valid)
+
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -418,99 +515,67 @@ class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
-        # We need to take care to act on the whole batch, because we might be
-        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        docs = [eg.x for eg in examples if len(eg.x)]
-
+        # The probability we use beam update, instead of falling back to
+        # a greedy update
+        beam_update_prob = self.cfg["beam_update_prob"]
+        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
+            return self.update_beam(
+                examples,
+                beam_width=self.cfg["beam_width"],
+                sgd=sgd,
+                losses=losses,
+                beam_density=self.cfg["beam_density"]
+            )
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            init_states, gold_states, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states, golds, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
-
-        inputs = TransitionModelInputs(docs=docs,
-                                       moves=self.moves,
-                                       max_moves=max_moves,
-                                       states=[state.copy() for state in init_states])
-        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
-        if sum(s.shape[0] for s in scores) == 0:
+            states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
             return losses
-        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-                                 examples, max_moves)
-        backprop_scores((pred_states, d_scores))
+        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
+ 
+        all_states = list(states)
+        states_golds = list(zip(states, golds))
+        n_moves = 0
+        while states_golds:
+            states, golds = zip(*states_golds)
+            scores, backprop = model.begin_update(states)
+            d_scores = self.get_batch_loss(states, golds, scores, losses)
+            # Note that the gradient isn't normalized by the batch size
+            # here, because our "samples" are really the states...But we
+            # can't normalize by the number of states either, as then we'd
+            # be getting smaller gradients for states in long sequences.
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, scores)
+            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(golds)
         if sgd not in (None, False):
             self.finish_update(sgd)
-        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
 
-    def get_loss(self, states_scores, examples, max_moves):
-        gold_states, init_states, pred_states, scores = states_scores
-        scores = self.model.ops.xp.vstack(scores)
-        costs = self._get_costs_from_histories(
-            examples,
-            gold_states,
-            init_states,
-            [list(state.history) for state in pred_states],
-            max_moves
-        )
-        xp = get_array_module(scores)
-        best_costs = costs.min(axis=1, keepdims=True)
-        gscores = scores.copy()
-        min_score = scores.min() - 1000
-        assert costs.shape == scores.shape, (costs.shape, scores.shape)
-        gscores[costs > best_costs] = min_score
-        max_ = scores.max(axis=1, keepdims=True)
-        gmax = gscores.max(axis=1, keepdims=True)
-        exp_scores = xp.exp(scores - max_)
-        exp_gscores = xp.exp(gscores - gmax)
-        Z = exp_scores.sum(axis=1, keepdims=True)
-        gZ = exp_gscores.sum(axis=1, keepdims=True)
-        d_scores = exp_scores / Z
-        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
-        return d_scores
-
-    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
-        cdef TransitionSystem moves = self.moves
-        cdef StateClass state
-        cdef int clas
-        cdef int nO = moves.n_moves
-        cdef Pool mem = Pool()
-        cdef np.ndarray costs_i
-        is_valid = <int*>mem.alloc(nO, sizeof(int))
-        batch = list(zip(init_states, histories, gold_states))
-        n_moves = 0
-        output = []
-        while batch:
-            costs = numpy.zeros((len(batch), nO), dtype="f")
-            for i, (state, history, gold) in enumerate(batch):
-                costs_i = costs[i]
-                clas = history.pop(0)
-                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
-                action = moves.c[clas]
-                action.do(state.c, action.label)
-                state.c.history.push_back(clas)
-            output.append(costs)
-            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
-            if n_moves >= max_moves >= 1:
-                break
-            n_moves += 1
-
-        return self.model.ops.xp.vstack(output)
-
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -520,9 +585,10 @@ class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.0)
+        losses.setdefault(self.name, 0.)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
+        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -530,33 +596,85 @@ class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
-        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
-
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
-
-        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
-        student_scores = self.model.ops.xp.vstack(student_scores)
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
-        # If all weights for an output are 0 in the original model, don't
-        # supervise that output. This allows us to add classes.
-        loss = (d_scores**2).sum() / d_scores.size
-        backprop_scores((student_states, d_scores))
-
+        tutor, _ = self._rehearsal_model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
+        n_scores = 0.
+        loss = 0.
+        while states:
+            targets, _ = tutor.begin_update(states)
+            guesses, backprop = model.begin_update(states)
+            d_scores = (guesses - targets) / targets.shape[0]
+            # If all weights for an output are 0 in the original model, don't
+            # supervise that output. This allows us to add classes.
+            loss += (d_scores**2).sum()
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, guesses)
+            states = [state for state in states if not state.is_final()]
+            n_scores += d_scores.size
+        # Do the backprop
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += loss
-
+        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width, drop=0.,
-                    sgd=None, losses=None, beam_density=0.0):
-        raise NotImplementedError
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
+            return losses
+        # Prepare the stepwise model, and get the callback for finishing the batch
+        model, backprop_tok2vec = self.model.begin_update(
+            [eg.predicted for eg in examples])
+        loss = _beam_utils.update_beam(
+            self.moves,
+            states,
+            golds,
+            model,
+            beam_width,
+            beam_density=beam_density,
+        )
+        losses[self.name] += loss
+        backprop_tok2vec(golds)
+        if sgd is not None:
+            self.finish_update(sgd)
+
+    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
+        cdef StateClass state
+        cdef Pool mem = Pool()
+        cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+
+        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
+        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
+        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
+                                        dtype='f', order='C')
+        c_d_scores = <float*>d_scores.data
+        unseen_classes = self.model.attrs["unseen_classes"]
+        for i, (state, gold) in enumerate(zip(states, golds)):
+            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
+            memset(costs, 0, self.moves.n_moves * sizeof(float))
+            self.moves.set_costs(is_valid, costs, state.c, gold)
+            for j in range(self.moves.n_moves):
+                if costs[j] <= 0.0 and j in unseen_classes:
+                    unseen_classes.remove(j)
+            cpu_log_loss(c_d_scores,
+                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
+        if losses is not None:
+            losses.setdefault(self.name, 0.)
+            losses[self.name] += (d_scores**2).sum()
+        return d_scores
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -595,7 +713,7 @@ class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize((doc_sample, self.moves))
+        self.model.initialize(doc_sample)
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -688,27 +806,26 @@ class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long doc will get multiple states. Let's say we
-        have a doc of length 2*N, where N is the shortest doc. We'll make
-        two states, one representing long_doc[:N], and another representing
-        long_doc[N:]."""
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-            TransitionSystem moves = self.moves
-        all_states = moves.init_batch([eg.predicted for eg in examples])
+        all_states = self.moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if moves.has_gold(eg) and not state.is_final():
-                gold = moves.init_gold(state, eg)
+            if self.moves.has_gold(eg) and not state.is_final():
+                gold = self.moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = moves.get_oracle_sequence_from_state(
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -718,52 +835,13 @@ class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = moves.c[clas]
+                    action = self.moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
-
-
-@contextlib.contextmanager
-def _change_attrs(model, **kwargs):
-    """Temporarily modify a thinc model's attributes."""
-    unset = object()
-    old_attrs = {}
-    for key, value in kwargs.items():
-        old_attrs[key] = model.attrs.get(key, unset)
-        model.attrs[key] = value
-    yield model
-    for key, value in old_attrs.items():
-        if value is unset:
-            model.attrs.pop(key)
-        else:
-            model.attrs[key] = value
-
-
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
-    cdef int step
-    cdef StateClass state
-    cdef StateC* c_state
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for state in states:
-            c_state = state.c
-            if step < c_state.history.size():
-                step_actions.append(c_state.history[step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index bb9b7653ce3..5d5fbb02790 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,7 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-def test_overfitting_IO():
-    fix_random_seed(1)
+@pytest.mark.parametrize("use_upper", [True, False])
+def test_overfitting_IO(use_upper):
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {}})
+    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,6 +577,7 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
+        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index d25eb165acb..42cf5ced998 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-import itertools
-
-import numpy
 import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
@@ -62,8 +59,6 @@
     ),
 ]
 
-PARSERS = ["parser"]  # TODO: Test beam_parser when ready
-
 eps = 0.1
 
 
@@ -176,57 +171,6 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
-def test_parser_apply_actions(en_vocab, en_parser):
-    words = ["I", "ate", "pizza"]
-    words2 = ["Eat", "more", "pizza", "!"]
-    doc1 = Doc(en_vocab, words=words)
-    doc2 = Doc(en_vocab, words=words2)
-    docs = [doc1, doc2]
-
-    moves = en_parser.moves
-    moves.add_action(0, "")
-    moves.add_action(1, "")
-    moves.add_action(2, "nsubj")
-    moves.add_action(3, "obj")
-    moves.add_action(2, "amod")
-
-    actions = [
-        numpy.array([0, 0], dtype="i"),
-        numpy.array([2, 0], dtype="i"),
-        numpy.array([0, 4], dtype="i"),
-        numpy.array([3, 3], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([0], dtype="i"),
-        numpy.array([1], dtype="i"),
-    ]
-
-    states = moves.init_batch(docs)
-    active_states = states
-
-    for step_actions in actions:
-        active_states = moves.apply_actions(active_states, step_actions)
-
-    assert len(active_states) == 0
-
-    for (state, doc) in zip(states, docs):
-        moves.set_annotations(state, doc)
-
-    assert docs[0][0].head.i == 1
-    assert docs[0][0].dep_ == "nsubj"
-    assert docs[0][1].head.i == 1
-    assert docs[0][1].dep_ == "ROOT"
-    assert docs[0][2].head.i == 1
-    assert docs[0][2].dep_ == "obj"
-
-    assert docs[1][0].head.i == 0
-    assert docs[1][0].dep_ == "ROOT"
-    assert docs[1][1].head.i == 2
-    assert docs[1][1].dep_ == "amod"
-    assert docs[1][2].head.i == 0
-    assert docs[1][2].dep_ == "obj"
-
-
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -375,7 +319,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", PARSERS)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -401,15 +345,11 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize(
-    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
-)
-def test_overfitting_IO(pipe_name, max_moves):
-    fix_random_seed(0)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
-    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -511,12 +451,10 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TODO: re-enable after we have a spacy-legacy release for v4. See
-        # https://github.com/explosion/spacy-legacy/pull/36
-        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V1
+        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V2
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
-        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f6cefbc1f84..c3c4bb6c686 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v3"
+    @architectures = "spacy.TransitionBasedParser.v2"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..8a1c74ca9ed 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,11 +189,33 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
+use_upper = true
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+parser_config_string_no_upper = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 66
+maxout_pieces = 2
+use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -224,6 +246,7 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
+        use_upper=True,
     )
     return parser
 
@@ -337,16 +360,15 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("W")
-        assert output.has_param("b")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        assert model.get_ref("upper").get_dim("nI") == 65
+        assert model.get_ref("lower").get_dim("nI") == 65
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -359,13 +381,11 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("b")
-        assert output.has_param("W")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        if model.attrs["has_upper"]:
+            assert model.get_ref("upper").get_dim("nI") == 66
+        assert model.get_ref("lower").get_dim("nI") == 66
 
 
 def test_config_nlp_roundtrip():
@@ -561,7 +581,9 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c05ef625e11..44717f6eba2 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,19 +1,22 @@
 import ctypes
 import os
 from pathlib import Path
-
 import pytest
-from pydantic import ValidationError
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+
+try:
+    from pydantic.v1 import ValidationError
+except ImportError:
+    from pydantic import ValidationError  # type: ignore
+
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int, find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -96,6 +99,34 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
+    assert model.get_param("W").shape == (nF, nO, nP, nI)
+    tensor = model.ops.alloc((10, nI))
+    Y, get_dX = model.begin_update(tensor)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
+    dY = model.ops.alloc((15, nO, nP))
+    ids = model.ops.alloc((15, nF))
+    ids[1, 2] = -1
+    dY[1] = 1
+    assert not model.has_grad("pad")
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 2, 0, 0] == 1.0
+    ids.fill(0.0)
+    dY.fill(0.0)
+    dY[0] = 0
+    ids[1, 2] = 0
+    ids[1, 1] = -1
+    ids[1, 0] = -1
+    dY[1] = 1
+    ids[2, 0] = -1
+    dY[2] = 5
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 0, 0, 0] == 6
+    assert d_pad[0, 1, 0, 0] == 1
+    assert d_pad[0, 2, 0, 0] == 0
+
+
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 914e877f579..8217de5bfe7 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,6 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
-
+import warnings
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index db8f974ea19..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,17 +833,18 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v3"
+> @architectures = "spacy.TransitionBasedParser.v2"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
+> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -873,22 +874,23 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                       |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
- </Accordion>
+</Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 765bcb8c675..1fae1dc6cda 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -741,7 +741,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -764,7 +764,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 44c80622437..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 1b0bc9606e9..2bd2856b6a3 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From df3d864cba1168a5356cac065a9f1cd234ccd21d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:24:09 +0100
Subject: [PATCH 199/504] isort

---
 spacy/ml/models/parser.py                   |  9 +++--
 spacy/ml/parser_model.pxd                   |  5 ++-
 spacy/ml/parser_model.pyx                   |  7 ++--
 spacy/ml/tb_framework.py                    |  3 +-
 spacy/pipeline/_parser_internals/_state.pxd |  3 +-
 spacy/pipeline/dep_parser.pyx               |  3 +-
 spacy/pipeline/ner.pyx                      |  9 +++--
 spacy/pipeline/transition_parser.pxd        |  6 +--
 spacy/pipeline/transition_parser.pyx        | 45 +++++++++++++--------
 spacy/tests/parser/test_ner.py              |  1 -
 spacy/tests/test_misc.py                    | 20 ++++-----
 spacy/training/example.pyx                  |  4 +-
 12 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea8f..f6c0e565dd3 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...errors import Errors
 from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-from ...tokens import Doc
 
 
 @registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea53f..ca31c169964 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
 from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
 
 
 cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 91558683b60..90e836f8a0a 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,18 +1,19 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
 cimport numpy as np
 from libc.math cimport exp
-from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 
 import numpy
 import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
 
 from .. import util
 from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
 from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e24e..e351ad4e570 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
 from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
 from ..util import registry
+from .parser_model import ParserStepModel
 
 
 @registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 04274ce8af1..c063cf97cd4 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,4 +1,5 @@
 cimport libcpp
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -7,8 +8,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3e59deaae4f..1fdc55dab41 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
 
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c73ec5c528a..29f22b0174d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -10,13 +10,14 @@ from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+
 from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..util import registry
+from ..scorer import PRFScore, get_ner_prf
 from ..training import remove_bilu_prefix
-
+from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index f20e69a6e56..a48d76b6819 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -1,11 +1,11 @@
 from cymem.cymem cimport Pool
 from thinc.backends.cblas cimport CBlas
 
+from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
 from ..vocab cimport Vocab
-from .trainable_pipe cimport TrainablePipe
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
 from ._parser_internals._state cimport StateC
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 
 cdef class Parser(TrainablePipe):
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 4290420c788..2fb3af44ddf 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,21 +1,20 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
+from cymem.cymem cimport Pool
+
 from itertools import islice
-from libcpp.vector cimport vector
-from libc.string cimport memset, memcpy
+
 from libc.stdlib cimport calloc, free
+from libc.string cimport memcpy, memset
+from libcpp.vector cimport vector
+
 import random
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -23,21 +22,36 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
+    softmax_activation,
+    use_ops,
 )
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
-from ._parser_internals.stateclass cimport StateClass
-from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..ml.parser_model cimport (
+    ActivationsC,
+    SizesC,
+    WeightsC,
+    alloc_activations,
+    arg_max_if_valid,
+    cpu_log_loss,
+    free_activations,
+    get_c_sizes,
+    get_c_weights,
+    predict_states,
+)
 from ..tokens.doc cimport Doc
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateClass
+
 from .trainable_pipe import TrainablePipe
+
 from ._parser_internals cimport _beam_utils
+
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
@@ -62,7 +76,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
-
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 5d5fbb02790..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,7 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 44717f6eba2..d2a41ff0fed 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 from pathlib import Path
+
 import pytest
 
 try:
@@ -8,15 +9,16 @@
 except ImportError:
     from pydantic import ValidationError  # type: ignore
 
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 8217de5bfe7..e41f9e02eb3 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,6 @@
-# cython: profile=False
-from collections.abc import Iterable as IterableInstance
 import warnings
+from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64

From b33f1d172de621d6e8d1eb50ff41eb96754f4ec9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:38:01 +0100
Subject: [PATCH 200/504] Add distillation tests with max cut size

And fix endless loop when the max cut size is 0 or 1.
---
 spacy/pipeline/transition_parser.pyx | 2 +-
 spacy/tests/parser/test_ner.py       | 5 ++++-
 spacy/tests/parser/test_parse.py     | 5 ++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 2fb3af44ddf..17a4fdb1b93 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -302,7 +302,7 @@ cdef class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
             states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..7c3a9d56249 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 42cf5ced998..dbede7edd52 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -402,7 +402,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -420,6 +422,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 61ed8f19f893b6c7b3c35be37bf29b2f5f700959 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:02:15 +0100
Subject: [PATCH 201/504] Fix Cython lints

---
 spacy/ml/parser_model.pxd                |  9 ++--
 spacy/ml/parser_model.pyx                | 64 ++++++++++++------------
 spacy/pipeline/_parser_internals/ner.pyx |  4 +-
 spacy/pipeline/dep_parser.pyx            |  1 +
 spacy/pipeline/ner.pyx                   |  3 +-
 spacy/pipeline/transition_parser.pxd     |  4 +-
 spacy/pipeline/transition_parser.pyx     | 42 ++++++----------
 spacy/training/example.pyx               |  1 -
 8 files changed, 58 insertions(+), 70 deletions(-)

diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index ca31c169964..88386255147 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -41,10 +41,9 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
 cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
- 
+                         const WeightsC* W, SizesC n) nogil
+
 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
- 
+cdef void cpu_log_loss(float* d_scores, const float* costs,
+                       const int* is_valid, const float* scores, int O) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 90e836f8a0a..843275f4c8b 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -13,7 +13,7 @@ from .. import util
 from ..errors import Errors
 
 from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
@@ -78,31 +78,31 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
+                                   n.states * n.classes * sizeof(A.scores[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
+                         const WeightsC* W, SizesC n) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
     memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
+                       n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
+                     &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = _arg_max(&A.unmaxed[index], n.pieces)
@@ -112,10 +112,10 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
         memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
     else:
         # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes)
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
+                     <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens, 0.0,
+                     A.scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
@@ -131,9 +131,9 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
                 A.scores[i*n.classes+j] = min_
 
 
-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
+cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
+                             const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     padding = cached
     cached += F * O
@@ -150,9 +150,8 @@ cdef void sum_state_features(CBlas cblas, float* output,
         token_ids += F
 
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
+cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
+                       const float* scores, int O) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -178,7 +177,7 @@ cdef void cpu_log_loss(float* d_scores,
 
 
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
+                         const int* is_valid, int n) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -202,10 +201,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
     return best
 
 
-
 class ParserStepModel(Model):
     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
+                 dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
         self.attrs["dropout_rate"] = dropout
@@ -267,7 +265,7 @@ class ParserStepModel(Model):
 
     def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
         if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+           and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
             # Move token_ids and d_vector to GPU, asynchronously
             self.backprops.append((
                 util.get_async(self.cuda_stream, token_ids),
@@ -277,7 +275,6 @@ class ParserStepModel(Model):
         else:
             self.backprops.append((token_ids, d_vector, get_d_tokvecs))
 
-
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -290,14 +287,15 @@ class ParserStepModel(Model):
             ids = ids.flatten()
             d_state_features = d_state_features.reshape(
                 (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
+            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
         # Padded -- see update()
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs
 
+
 NUMPY_OPS = NumpyOps()
 
+
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
@@ -310,7 +308,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
+        def get_d_vector(d_scores): return d_scores
     # If the class is unseen, make sure its score is minimum
     scores[:, model._class_mask == 0] = numpy.nanmin(scores)
 
@@ -445,8 +443,8 @@ cdef class precompute_hiddens:
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(cblas, <float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0], token_ids.shape[0],
+                           self.nF, self.nO*self.nP)
         state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
@@ -471,7 +469,7 @@ cdef class precompute_hiddens:
 
         def backprop_maxout(d_best):
             return self.ops.backprop_maxout(d_best, mask, self.nP)
-        
+
         return state_vector, backprop_maxout
 
     def _relu_nonlinearity(self, state_vector):
@@ -485,7 +483,7 @@ cdef class precompute_hiddens:
         def backprop_relu(d_best):
             d_best *= mask
             return d_best.reshape((d_best.shape + (1,)))
- 
+
         return state_vector, backprop_relu
 
 cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 0b9980ddbf2..be769bd9cd0 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -159,7 +159,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-    
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -644,7 +644,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
- 
+
 
 
 cdef class Out:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fdc55dab41..cbd7187ff0f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 29f22b0174d..fe54d33a17b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index a48d76b6819..7adb82213de 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser(TrainablePipe):
     cdef object _cpu_ops
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
+                      WeightsC weights, SizesC sizes) nogil
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
+                                 int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 17a4fdb1b93..fa9a76772ec 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -10,7 +10,7 @@ from cymem.cymem cimport Pool
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
@@ -23,14 +23,13 @@ from thinc.api import (
     NumpyOps,
     Optimizer,
     chain,
-    get_array_module,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
     ActivationsC,
@@ -45,7 +44,6 @@ from ..ml.parser_model cimport (
     predict_states,
 )
 from ..tokens.doc cimport Doc
-from ._parser_internals.search cimport Beam
 from ._parser_internals.stateclass cimport StateClass
 
 from .trainable_pipe import TrainablePipe
@@ -55,11 +53,10 @@ from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
-from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals cimport _beam_utils
 from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals.transition_system cimport Transition
 from .trainable_pipe cimport TrainablePipe
 
 from .. import util
@@ -295,7 +292,7 @@ cdef class Parser(TrainablePipe):
         with use_ops("numpy"):
             teacher_model = chain(teacher_step_model, softmax_activation())
             student_model = chain(student_step_model, softmax_activation())
-        
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -440,8 +437,6 @@ cdef class Parser(TrainablePipe):
         return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
         batch = _beam_utils.BeamBatch(
             self.moves,
@@ -462,15 +457,15 @@ cdef class Parser(TrainablePipe):
         return list(batch)
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+                      WeightsC weights, SizesC sizes) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(states, activations.scores,
+                                    sizes.classes, sizes.states)
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -499,7 +494,7 @@ cdef class Parser(TrainablePipe):
         return [state for state in states if not state.c.is_final()]
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
+                                 int nr_class, int batch_size) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -557,8 +552,7 @@ cdef class Parser(TrainablePipe):
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
- 
-        all_states = list(states)
+
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -638,8 +632,8 @@ cdef class Parser(TrainablePipe):
         del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
+                    losses=None, beam_density=0.0):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -670,7 +664,7 @@ cdef class Parser(TrainablePipe):
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                        dtype='f', order='C')
+                                               dtype='f', order='C')
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -680,8 +674,8 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
+                         d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
@@ -791,10 +785,7 @@ cdef class Parser(TrainablePipe):
         long_doc[:N], and another representing long_doc[N:]. In contrast to
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
-        cdef:
-            StateClass start_state
-            StateClass state
-            Transition action
+        cdef StateClass state
         all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
@@ -816,7 +807,6 @@ cdef class Parser(TrainablePipe):
                 length += 1
         return states
 
-
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index e41f9e02eb3..efca4bcb03b 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,3 @@
-import warnings
 from collections.abc import Iterable as IterableInstance
 
 import numpy

From 2b4f892ce759ac4afd35c6e4b64479cc5d726682 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:17:24 +0100
Subject: [PATCH 202/504] Bring back W401

---
 spacy/errors.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index a5d0b3d11a9..5d6d65e3b26 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

From ba2968a954b34e28dfec5a050498fad9f24c2aba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 19 Dec 2023 09:28:20 +0100
Subject: [PATCH 203/504] Fix `TransitionBasedParser` version in transformer
 embeddings docs

---
 website/docs/usage/embeddings-transformers.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..534cf478087 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 3fdb91a37572d10e1533f3af0a26b3cfe0e36d33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 09:47:38 +0100
Subject: [PATCH 204/504] No need for `Literal` compat, since we only support
 >= 3.8

---
 spacy/compat.py           | 5 -----
 spacy/errors.py           | 1 -
 spacy/ml/models/parser.py | 3 +--
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 30459e2e495..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,11 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 5d6d65e3b26..1af8a3b0891 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1006,7 +1006,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
-
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..e776174f6ed 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,9 +1,8 @@
-from typing import List, Optional, cast
+from typing import List, Literal, Optional
 
 from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry

From add27267b991ec563236b1a7e02803ad9fc11819 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 10:06:28 +0100
Subject: [PATCH 205/504] Fix parser distillation test seed

The test would sometimes fail. Rather than increasing test by increasing
training iterations, use a known-good seed.
---
 spacy/tests/parser/test_parse.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index dbede7edd52..f63d56f6922 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -405,6 +405,7 @@ def test_is_distillable():
 @pytest.mark.slow
 @pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
 def test_distill(max_moves):
+    fix_random_seed(0)
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []

From 5ed19778c97c3694ee1f0c2e3be2f44d9fe7f687 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:14:35 +0100
Subject: [PATCH 206/504] TransitionBasedParser.v2 in run example output

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/docs/api/cli.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 1fae1dc6cda..cfa99a2b350 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]

From 9f96b81655a7714fc45c7f6544202c9f95984a1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 16 Jan 2024 14:54:26 +0100
Subject: [PATCH 207/504] Update thinc dependency to 9.0.0.dev4

---
 pyproject.toml                         |  2 +-
 requirements.txt                       |  2 +-
 setup.cfg                              | 10 +++++++++-
 spacy/pipeline/edit_tree_lemmatizer.py |  7 ++-----
 spacy/pipeline/morphologizer.pyx       |  7 +++----
 spacy/pipeline/senter.pyx              |  7 ++-----
 spacy/pipeline/tagger.pyx              | 13 ++++++++-----
 spacy/pipeline/transition_parser.pyx   |  4 ++--
 8 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3891d137867..0a5bc162773 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=9.0.0.dev4,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 760f056ebff..305a842c4d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev4,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 749ac0959ad..09af5a1199f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev4,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -37,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev2,<9.1.0
+    thinc>=9.0.0.dev4,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 046ef19c3d5..1a29735e8e8 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -5,7 +5,6 @@
 import numpy as np
 import srsly
 from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
@@ -128,9 +127,7 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(
-            normalize=False, missing_value=-1
-        )
+        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
 
         truths = []
         for eg in examples:
@@ -166,7 +163,7 @@ def get_teacher_student_loss(
 
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 669a5424412..0f77326e67d 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -2,9 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, Optional, Union
 
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
@@ -296,7 +294,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 42615e194e0..51670dcf8cf 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -2,10 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
-import srsly
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..tokens.doc cimport Doc
 
@@ -160,7 +157,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f3d0527ea0b..21c7b3ab0a3 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -4,9 +4,7 @@ from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
-import srsly
-from thinc.api import Config, Model, set_dropout_rate
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
 from thinc.types import Floats2d, Ints1d
 
 from ..morphology cimport Morphology
@@ -275,7 +273,7 @@ class Tagger(TrainablePipe):
         
         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
@@ -292,7 +290,12 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
+        loss_func = SequenceCategoricalCrossentropy(
+            names=self.labels,
+            normalize=False,
+            neg_prefix=self.cfg["neg_prefix"],
+            label_smoothing=self.cfg["label_smoothing"]
+        )
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index fa9a76772ec..c728f1b7909 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -22,13 +22,13 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    SequenceCategoricalCrossentropy,
     chain,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
@@ -355,7 +355,7 @@ cdef class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))

From cc72da22279aff09df52000377a25ac6bc92798a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 17 Jan 2024 09:53:01 +0100
Subject: [PATCH 208/504] Temporily xfail local remote storage test

---
 spacy/tests/test_cli.py | 61 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a47f03e8ab4..c9e823ffe68 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -878,6 +878,67 @@ def test_applycli_user_data():
         assert result[0]._.ext == val
 
 
+# TODO: remove this xfail after merging master into v4. The issue
+#       is that for local files, pathy started returning os.stat_result,
+#       which doesn't have a last_modified property. So, recency-sorting
+#       fails and the test fails. However, once we merge master into
+#       v4, we'll use weasel, which in turn uses cloudpathlib, which
+#       should resolve this issue.
+@pytest.mark.xfail(reason="Recency sorting is broken on some platforms")
+def test_local_remote_storage():
+    with make_tempdir() as d:
+        filename = "a.txt"
+
+        content_hashes = ("aaaa", "cccc", "bbbb")
+        for i, content_hash in enumerate(content_hashes):
+            # make sure that each subsequent file has a later timestamp
+            if i > 0:
+                time.sleep(1)
+            content = f"{content_hash} content"
+            loc_file = d / "root" / filename
+            if not loc_file.parent.exists():
+                loc_file.parent.mkdir(parents=True)
+            with loc_file.open(mode="w") as file_:
+                file_.write(content)
+
+            # push first version to remote storage
+            remote = RemoteStorage(d / "root", str(d / "remote"))
+            remote.push(filename, "aaaa", content_hash)
+
+            # retrieve with full hashes
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with command hash
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa")
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with content hash
+            loc_file.unlink()
+            remote.pull(filename, content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with no hashes
+            loc_file.unlink()
+            remote.pull(filename)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+
+def test_local_remote_storage_pull_missing():
+    # pulling from a non-existent remote pulls nothing gracefully
+    with make_tempdir() as d:
+        filename = "a.txt"
+        remote = RemoteStorage(d / "root", str(d / "remote"))
+        assert remote.pull(filename, command_hash="aaaa") is None
+        assert remote.pull(filename) is None
+
+
 def test_cli_find_threshold(capsys):
     def make_examples(nlp: Language) -> List[Example]:
         docs: List[Example] = []

From 64bad1c50b70625a9ca5e3fb52d660709e487508 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 30 Oct 2023 17:02:08 +0100
Subject: [PATCH 209/504] Add note in docs on `score_weight` config if using a
 non-default `spans_key` for SpanCat (#13093)

* Add note on score_weight if using a non-default span_key for SpanCat.

* Fix formatting.

* Fix formatting.

* Fix typo.

* Use warning infobox.

* Fix infobox formatting.
---
 website/docs/api/spancategorizer.mdx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 258db794786..33219751ca6 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -75,8 +75,7 @@ architectures and their arguments and hyperparameters.
 <Infobox variant="warning">
 
 If you set a non-default value for `spans_key`, you'll have to update
-`[training.score_weights]` as well so that weights are computed properly. E. g.
-for `spans_key == "myspankey"`, include this in your config:
+`[training.score_weights]` as well so that weights are computed properly. E. g. for `span_key == "myspankey"`, include this in your config:
 
 ```ini
 [training.score_weights]

From a3af2de1bd0e1da5e220afcd06cc9e8c3371a627 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 10 Nov 2023 08:05:07 +0100
Subject: [PATCH 210/504] Warn about reloading dependencies after downloading
 models (#13081)

* Update the "Missing factory" error message

This accounts for model installations that took place during the current Python session.

* Add a note about Jupyter notebooks

* Move error to `spacy.cli.download`
Add extra message for Jupyter sessions

* Add additional note for interactive sessions

* Remove note about `spacy-transformers` from error message

* `isort`

* Improve checks for colab (also helps displacy)

* Update warning messages

* Improve flow for multiple checks

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/download.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0635522930b..5e460717cc4 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,10 +7,11 @@
 from wasabi import msg
 
 from .. import about
+from ..errors import OLD_MODEL_SHORTCUTS
 from ..util import (
-    get_installed_models,
     get_minor_version,
-    get_package_version,
+    is_in_interactive,
+    is_in_jupyter,
     is_package,
     is_prerelease_version,
     run_command,

From 12bdba2a221a3de28df6c219ea6052e6143dcf81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 29 Nov 2023 09:11:54 +0100
Subject: [PATCH 211/504] Update `TextCatBOW` to use the fixed `SparseLinear`
 layer (#13149)

* Update `TextCatBOW` to use the fixed `SparseLinear` layer

A while ago, we fixed the `SparseLinear` layer to use all available
parameters: https://github.com/explosion/thinc/pull/754

This change updates `TextCatBOW` to `v3` which uses the new
`SparseLinear_v2` layer. This results in a sizeable improvement on a
text categorization task that was tested.

While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent`
option to make it possible to change the hidden size. Ideally, we'd just
have an option called `length`. But the way that `TextCatBOW` uses
hashes results in a non-uniform distribution of parameters when the
length is not a power of two.

* Replace TexCatBOW `length_exponent` parameter by `length`

We now round up the length to the next power of two if it isn't
a power of two.

* Remove some tests for TextCatBOW.v2

* Fix missing import
---
 spacy/errors.py                      |  3 ---
 spacy/tests/pipeline/test_textcat.py |  8 +++---
 website/docs/api/architectures.mdx   | 40 ++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 1af8a3b0891..571335009be 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -974,9 +974,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
-    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
-             "reduction. Please enable one of `use_reduce_first`, "
-             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3f2d757eebc..3653739befd 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -499,9 +499,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # REDUCE
-        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        # CNN
+        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
     ],
 )
 # fmt: on
@@ -749,7 +749,7 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2 (legacy)
+        # CNN V2
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # PARAMETRIC ATTENTION V1
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..31beb15644c 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,6 +1020,46 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
+<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+</Accordion>
+
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}
+
 > #### Example Config
 >
 > ```ini

From fd7ad53454d83c51a6befacef0df29958f21b7aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:00:06 +0100
Subject: [PATCH 212/504] Add TextCatReduce.v1 (#13181)

* Add TextCatReduce.v1

This is a textcat classifier that pools the vectors generated by a
tok2vec implementation and then applies a classifier to the pooled
representation. Three reductions are supported for pooling: first, max,
and mean. When multiple reductions are enabled, the reductions are
concatenated before providing them to the classification layer.

This model is a generalization of the TextCatCNN model, which only
supports mean reductions and is a bit of a misnomer, because it can also
be used with transformers. This change also reimplements TextCatCNN.v2
using the new TextCatReduce.v1 layer.

* Doc fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fully specify `TextCatCNN` <-> `TextCatReduce` equivalence

* Move TextCatCNN docs to legacy, in prep for moving to spacy-legacy

* Add back a test for TextCatCNN.v2

* Replace TextCatCNN in pipe configurations and templates

* Add an infobox to the `TextCatReduce` section with an `TextCatCNN` anchor

* Add last reduction (`use_reduce_last`)

* Remove non-working TextCatCNN Netlify redirect

* Revert layer changes for the quickstart

* Revert one more quickstart change

* Remove unused import

* Fix docstring

* Fix setting name in error message

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                      |  3 +
 spacy/ml/models/textcat.py           | 85 ++++------------------------
 spacy/tests/pipeline/test_textcat.py | 13 ++---
 website/docs/api/architectures.mdx   | 78 -------------------------
 4 files changed, 21 insertions(+), 158 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 571335009be..1af8a3b0891 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -974,6 +974,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 601c94a7f0a..1a49bac1d9d 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -22,6 +22,9 @@
     reduce_first,
     reduce_last,
     reduce_max,
+    reduce_first,
+    reduce_last,
+    reduce_max,
     reduce_mean,
     reduce_sum,
     residual,
@@ -63,6 +66,15 @@ def build_simple_cnn_text_classifier(
         use_reduce_mean=True,
         nO=nO,
     )
+    return build_reduce_text_classifier(
+        tok2vec=tok2vec,
+        exclusive_classes=exclusive_classes,
+        use_reduce_first=False,
+        use_reduce_last=False,
+        use_reduce_max=False,
+        use_reduce_mean=True,
+        nO=nO,
+    )
 
 
 def resize_and_set_ref(model, new_nO, resizable_layer):
@@ -221,79 +233,6 @@ def build_text_classifier_lowdata(
     return model
 
 
-@registry.architectures("spacy.TextCatParametricAttention.v1")
-def build_textcat_parametric_attention_v1(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    exclusive_classes: bool,
-    nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    width = tok2vec.maybe_get_dim("nO")
-    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
-        tok2vec=tok2vec,
-        nonlinear_layer=Maxout(nI=width, nO=width),
-        key_transform=Gelu(nI=width, nO=width),
-    )
-    with Model.define_operators({">>": chain}):
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO)
-        else:
-            output_layer = Linear(nO=nO) >> Logistic()
-        model = parametric_attention >> output_layer
-    if model.has_dim("nO") is not False and nO is not None:
-        model.set_dim("nO", cast(int, nO))
-    model.set_ref("output_layer", output_layer)
-    model.attrs["multi_label"] = not exclusive_classes
-
-    return model
-
-
-def _build_parametric_attention_with_residual_nonlinear(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    nonlinear_layer: Model[Floats2d, Floats2d],
-    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
-) -> Model[List[Doc], Floats2d]:
-    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
-        norm_layer = LayerNorm(nI=width)
-        parametric_attention = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
-        )
-
-        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
-
-        parametric_attention.set_ref("tok2vec", tok2vec)
-        parametric_attention.set_ref("attention_layer", attention_layer)
-        parametric_attention.set_ref("key_transform", key_transform)
-        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
-        parametric_attention.set_ref("norm_layer", norm_layer)
-
-        return parametric_attention
-
-
-def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nI", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nO", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
-    return model
-
-
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
     tok2vec: Model,
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3653739befd..9ee93af0fef 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -473,6 +473,8 @@ def test_no_resize(name, textcat_config):
         # CNN
         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -499,9 +501,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -749,12 +751,9 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2
+        # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
-        # PARAMETRIC ATTENTION V1
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 31beb15644c..63f723a28cf 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,46 +1020,6 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v3 {id="TextCatBOW"}
-
 > #### Example Config
 >
 > ```ini
@@ -1096,44 +1056,6 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
-### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatParametricAttention.v1"
-> exclusive_classes = true
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.Tok2Vec.v2"
->
-> [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v2"
-> width = 64
-> rows = [2000, 2000, 1000, 1000, 1000, 1000]
-> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
-> include_static_vectors = false
->
-> [model.tok2vec.encode]
-> @architectures = "spacy.MaxoutWindowEncoder.v2"
-> width = ${model.tok2vec.embed.width}
-> window_size = 1
-> maxout_pieces = 3
-> depth = 2
-> ```
-
-A neural network model that is built upon Tok2Vec and uses parametric attention
-to attend to tokens that are relevant to text classification.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From de0f5940905610c4e3a10ec8d903c69c47e80550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 2 Jan 2024 10:03:06 +0100
Subject: [PATCH 213/504] Add spacy.TextCatParametricAttention.v1 (#13201)

* Add spacy.TextCatParametricAttention.v1

This layer provides is a simplification of the ensemble classifier that
only uses paramteric attention. We have found empirically that with a
sufficient amount of training data, using the ensemble classifier with
BoW does not provide significant improvement in classifier accuracy.
However, plugging in a BoW classifier does reduce GPU training and
inference performance substantially, since it uses a GPU-only kernel.

* Fix merge fallout
---
 pyproject.toml                       |  5 ++-
 requirements.txt                     |  2 +-
 setup.cfg                            |  4 +-
 spacy/ml/models/textcat.py           | 65 ++++++++++++++++++++++++++++
 spacy/tests/pipeline/test_textcat.py |  3 ++
 website/docs/api/architectures.mdx   | 38 ++++++++++++++++
 6 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0a5bc162773..bfd7e68d1f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev4,<9.1.0",
-    "numpy>=1.15.0",
+    "thinc>=8.2.2,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 305a842c4d9..65fda804bfe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev4,<9.1.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 09af5a1199f..71dbf6a5337 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -45,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 1a49bac1d9d..4b3d2de9171 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -233,6 +233,71 @@ def build_text_classifier_lowdata(
     return model
 
 
+@registry.architectures("spacy.TextCatParametricAttention.v1")
+def build_textcat_parametric_attention_v1(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    exclusive_classes: bool,
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    width = tok2vec.maybe_get_dim("nO")
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=Gelu(nI=width, nO=width),
+    )
+    with Model.define_operators({">>": chain}):
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO)
+        else:
+            output_layer = Linear(nO=nO) >> Logistic()
+        model = parametric_attention >> output_layer
+    if model.has_dim("nO") is not False and nO is not None:
+        model.set_dim("nO", cast(int, nO))
+    model.set_ref("output_layer", output_layer)
+    model.attrs["multi_label"] = not exclusive_classes
+
+    return model
+
+
+def _build_parametric_attention_with_residual_nonlinear(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    nonlinear_layer: Model[Floats2d, Floats2d],
+    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
+) -> Model[List[Doc], Floats2d]:
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.maybe_get_dim("nO")
+        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
+        norm_layer = LayerNorm(nI=width)
+        parametric_attention = (
+            tok2vec
+            >> list2ragged()
+            >> attention_layer
+            >> reduce_sum()
+            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
+        )
+
+        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
+
+        parametric_attention.set_ref("tok2vec", tok2vec)
+        parametric_attention.set_ref("attention_layer", attention_layer)
+        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
+        parametric_attention.set_ref("norm_layer", norm_layer)
+
+        return parametric_attention
+
+
+def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    tok2vec_width = get_tok2vec_width(model)
+    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
+    init_chain(model, X, Y)
+    return model
+
+
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
     tok2vec: Model,
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 9ee93af0fef..2bba40d1d13 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -754,6 +754,9 @@ def test_overfitting_IO_multi():
         # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 63f723a28cf..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1056,6 +1056,44 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
+### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatParametricAttention.v1"
+> exclusive_classes = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v2"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v2"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v2"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
+> ```
+
+A neural network model that is built upon Tok2Vec and uses parametric attention
+to attend to tokens that are relevant to text classification.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From b4dcad00959cc6aa2ff3209ead81694ce3d0bacf Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 4 Dec 2023 15:23:28 +0100
Subject: [PATCH 214/504] Add documentation for EL task (#12988)

* Add documentation for EL task.

* Fix EL factory name.

* Add llm_entity_linker_mentio.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Incorporate feedback.

* Format.

* Fix link to KB data.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 website/docs/api/large-language-models.mdx | 172 ++++++++++++++++++++-
 1 file changed, 169 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx
index cefd5c66ee1..583aa25a44d 100644
--- a/website/docs/api/large-language-models.mdx
+++ b/website/docs/api/large-language-models.mdx
@@ -20,9 +20,10 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible
 through a generic `llm`
 [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
 as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
-`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`,
-`llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the
-GPT-3-5 model from OpenAI is used by default, but this can be customized.
+`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and
+`llm_entity_linker`.
+
+### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
 
 > #### Example
 >
@@ -687,6 +688,171 @@ for a toy example of how such a KB file might look like.
 | -------- | ------------------------------------- |
 | `path`   | Path to KB file. ~~Union[str, Path]~~ |
 
+### EL (Entity Linking) {id="nel"}
+
+The EL links recognized entities (see [NER](#ner)) to those in a knowledge base
+(KB). The EL task prompts the LLM to select the most likely candidate from the
+KB, whose structure can be arbitrary.
+
+Note that the documents processed by the entity linking task are expected to
+have recognized entities in their `.ents` attribute. This can be achieved by
+either running the [NER task](#ner), using a trained spaCy NER model or setting
+the entities manually prior to running the EL task.
+
+In order to be able to pull data from the KB, an object implementing the
+`CandidateSelector` protocol has to be provided. This requires two functions:
+(1) `__call__()` to fetch candidate entities for entity mentions in the text
+(assumed to be available in `Doc.ents`) and (2) `get_entity_description()` to
+fetch descriptions for any given entity ID. Descriptions can be empty, but
+ideally provide more context for entities stored in the KB.
+
+`spacy-llm` provides a `CandidateSelector` implementation
+(`spacy.CandidateSelector.v1`) that leverages a spaCy knowledge base - as used
+in an `entity_linking` component - to select candidates. This knowledge base can
+be loaded from an existing spaCy pipeline (note that the pipeline's EL component
+doesn't have to be trained) or from a separate .yaml file.
+
+#### spacy.EntityLinker.v1 {id="el-v1"}
+
+Supports zero- and few-shot prompting. Relies on a configurable component
+suggesting viable entities before letting the LLM pick the most likely
+candidate.
+
+> #### Example config for spacy.EntityLinker.v1
+>
+> ```ini
+> [paths]
+> el_nlp = null
+>
+> ...
+>
+> [components.llm.task]
+> @llm_tasks = "spacy.EntityLinker.v1"
+>
+> [initialize]
+> [initialize.components]
+> [initialize.components.llm]
+> [initialize.components.llm.candidate_selector]
+> @llm_misc = "spacy.CandidateSelector.v1"
+>
+> # Load a KB from a KB file. For loading KBs from spaCy pipelines see spacy.KBObjectLoader.v1.
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base .yaml file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument              | Description                                                                                                                                                                                   |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `template`            | Custom prompt template to send to LLM model. Defaults to [entity_linker.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/entity_linker.v1.jinja). ~~str~~ |
+| `parse_responses`     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[EntityLinkerTask]]~~                                   |
+| `prompt_example_type` | Type to use for fewshot examples. Defaults to `ELExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                 |
+| `examples`            | Optional callable that reads a file containing task examples for few-shot learning. If `None` is passed, zero-shot learning will be used. Defaults to `None`. ~~ExamplesConfigType~~          |
+| `scorer`              | Scorer function. Defaults to the metric used by spaCy to evaluate entity linking performance. ~~Optional[Scorer]~~                                                                            |
+
+##### spacy.CandidateSelector.v1 {id="candidate-selector-v1"}
+
+`spacy.CandidateSelector.v1` is an implementation of the `CandidateSelector`
+protocol required by [`spacy.EntityLinker.v1`](#el-v1). The built-in candidate
+selector method allows loading existing knowledge bases in several ways, e. g.
+loading from a spaCy pipeline with a (not necessarily trained) entity linking
+component, and loading from a file describing the knowlege base as a .yaml file.
+Either way the loaded data will be converted to a spaCy `InMemoryLookupKB`
+instance. The KB's selection capabilities are used to select the most likely
+entity candidates for the specified mentions.
+
+> #### Example config for spacy.CandidateSelector.v1
+>
+> ```ini
+> [initialize]
+> [initialize.components]
+> [initialize.components.llm]
+> [initialize.components.llm.candidate_selector]
+> @llm_misc = "spacy.CandidateSelector.v1"
+>
+> # Load a KB from a KB file. For loading KBs from spaCy pipelines see spacy.KBObjectLoader.v1.
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base .yaml file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument    | Description                                                       |
+| ----------- | ----------------------------------------------------------------- |
+| `kb_loader` | KB loader object. ~~InMemoryLookupKBLoader~~                      |
+| `top_n`     | Top-n candidates to include in the prompt. Defaults to 5. ~~int~~ |
+
+##### spacy.KBObjectLoader.v1 {id="kb-object-loader-v1"}
+
+Adheres to the `InMemoryLookupKBLoader` interface required by
+[`spacy.CandidateSelector.v1`](#candidate-selector-v1). Loads a knowledge base
+from an existing spaCy pipeline.
+
+> #### Example config for spacy.KBObjectLoader.v1
+>
+> ```ini
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBObjectLoader.v1"
+> # Path to knowledge base directory in serialized spaCy pipeline.
+> path = ${paths.el_kb}
+> # Path to spaCy pipeline. If this is not specified, spacy-llm tries to determine this automatically (but may fail).
+> nlp_path = ${paths.el_nlp}
+> # Path to file with descriptions for entity.
+> desc_path = ${paths.el_desc}
+> ```
+
+| Argument          | Description                                                                                                                                                                                                                         |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`            | Path to KB file. ~~Union[str, Path]~~                                                                                                                                                                                               |
+| `nlp_path`        | Path to serialized NLP pipeline. If None, path will be guessed. ~~Optional[Union[Path, str]]~~                                                                                                                                      |
+| `desc_path`       | Path to file with descriptions for entities. ~~int~~                                                                                                                                                                                |
+| `ent_desc_reader` | Entity description reader. Defaults to an internal method expecting a CSV file without header row, with ";" as delimiters, and with two columns - one for the entitys' IDs, one for their descriptions. ~~Optional[EntDescReader]~~ |
+
+##### spacy.KBFileLoader.v1 {id="kb-file-loader-v1"}
+
+Adheres to the `InMemoryLookupKBLoader` interface required by
+[`spacy.CandidateSelector.v1`](#candidate-selector-v1). Loads a knowledge base
+from a knowledge base file. The KB .yaml file has to stick to the following
+format:
+
+```yaml
+entities:
+  # The key should be whatever ID identifies this entity uniquely in your knowledge base.
+  ID1:
+      name: "..."
+      desc: "..."
+  ID2:
+      ...
+# Data on aliases in your knowledge base - e. g. "Apple" for the entity "Apple Inc.".
+aliases:
+  - alias: "..."
+    # List of all entities that this alias refers to.
+    entities: ["ID1", "ID2", ...]
+    # Optional: prior probabilities that this alias refers to the n-th entity in the "entities" attribute.
+    probabilities: [0.5, 0.2, ...]
+  - alias: "..."
+    entities: [...]
+    probabilities: [...]
+  ...
+```
+
+See
+[here](https://github.com/explosion/spacy-llm/blob/main/usage_examples/el_openai/el_kb_data.yml)
+for a toy example of how such a KB file might look like.
+
+> #### Example config for spacy.KBFileLoader.v1
+>
+> ```ini
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument | Description                           |
+| -------- | ------------------------------------- |
+| `path`   | Path to KB file. ~~Union[str, Path]~~ |
+
 ### NER {id="ner"}
 
 The NER task identifies non-overlapping entities in text.

From abe7158e25450b5d8522fa3ffe47a2fef3da85c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 10:28:46 +0100
Subject: [PATCH 215/504] Typing fixes

---
 requirements.txt           |  2 +-
 spacy/tokens/span.pyi      |  2 ++
 spacy/training/example.pyi |  4 ++++
 spacy/training/example.pyx |  6 ++++++
 spacy/training/loop.py     | 26 ++++++++++++++------------
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 65fda804bfe..3b6c7dbdd8e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 2a529593e5f..f1030278c69 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -139,6 +139,8 @@ class Span:
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
+    @label_.setter
+    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property
diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
index 06639d70c06..33cf07b0902 100644
--- a/spacy/training/example.pyi
+++ b/spacy/training/example.pyi
@@ -9,6 +9,10 @@ def annotations_to_doc(
     tok_annot: Dict[str, Any],
     doc_annot: Dict[str, Any],
 ) -> Doc: ...
+def validate_distillation_examples(
+    examples: Iterable[Example],
+    method: str,
+) -> None: ...
 def validate_examples(
     examples: Iterable[Example],
     method: str,
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index efca4bcb03b..bc6852f83c6 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,12 @@ def validate_examples(examples, method):
 
 
 def validate_distillation_examples(examples, method):
+    """Check that a batch of examples received during processing is valid
+    for distillation.
+
+    examples (Iterable[Examples]): A batch of examples.
+    method (str): The method name to show in error messages.
+    """
     validate_examples(examples, method)
     for eg in examples:
         if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 63715ec2c42..575a583b78c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -12,7 +12,9 @@
     Iterable,
     List,
     Optional,
+    Sized,
     Tuple,
+    TypeVar,
     Union,
 )
 
@@ -22,7 +24,6 @@
 from .. import ty
 from ..errors import Errors
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
-from ..tokens.doc import Doc
 from ..util import (
     logger,
     registry,
@@ -282,7 +283,7 @@ def _distill_loop(
     teacher: "Language",
     student: "Language",
     optimizer: Optimizer,
-    distill_data: Iterable[List[Example]],
+    distill_data: Iterable[Tuple[int, List[Example]]],
     evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
@@ -401,7 +402,7 @@ def _distill_loop(
 def train_while_improving(
     nlp: "Language",
     optimizer: Optimizer,
-    train_data: Iterable[List[Example]],
+    train_data: Iterable[Tuple[int, List[Example]]],
     evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
@@ -520,15 +521,16 @@ def train_while_improving(
             break
 
 
+ItemT = TypeVar("ItemT", bound=Sized)
+
+
 def subdivide_batch(
-    batch: Union[Iterable[Doc], Iterable[Example]], accumulate_gradient: int
-):
+    batch: Iterable[ItemT], accumulate_gradient: int
+) -> Iterable[List[ItemT]]:
     batch = list(batch)
     if len(batch):
-        if isinstance(batch[0], Example):
-            batch.sort(key=lambda eg: len(eg.predicted))
-        else:
-            batch.sort(key=lambda doc: len(doc))
+        # Examples are sorted by their predicted length.
+        batch.sort(key=lambda item: len(item))
     sub_len = len(batch) // accumulate_gradient
     start = 0
     for i in range(accumulate_gradient):
@@ -578,7 +580,7 @@ def create_distill_batches(
     corpus: Callable[["Language"], Iterable[Example]],
     batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
     max_epochs: int,
-):
+) -> Iterable[Tuple[int, List[Example]]]:
     """Create distillation batches. In contrast to training, the corpus
     is normally too large to load into memory and shuffle."""
     epoch = 0
@@ -592,9 +594,9 @@ def create_distill_batches(
 def create_train_batches(
     nlp: "Language",
     corpus: Callable[["Language"], Iterable[Example]],
-    batcher: Callable[[Iterable[Example]], Iterable[Example]],
+    batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
     max_epochs: int,
-):
+) -> Iterable[Tuple[int, List[Example]]]:
     epoch = 0
     if max_epochs >= 0:
         examples = list(corpus(nlp))  # type: Iterable[Example]

From de0e47c19b57e78c4c244e70b9eaff6f5cd4c118 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 12:20:09 +0100
Subject: [PATCH 216/504] Py_UNICODE is not compatible with 3.12

---
 spacy/pipeline/_parser_internals/search.pyx |  2 +-
 spacy/tests/parser/_search.pyx              | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 578299b56ae..52d5cdaa891 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
 from cymem.cymem cimport Pool
 from libc.math cimport exp
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index cd9e6b2f5ee..ca2a2916094 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -12,7 +12,7 @@ from ..conftest import cytest
 cdef struct TestState:
     int length
     int x
-    Py_UNICODE* string
+    char *string
 
 
 cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
@@ -22,7 +22,7 @@ cdef int transition(void* dest, void* src, class_t clas, void* extra_args) excep
     dest_state.x = src_state.x
     dest_state.x += clas
     if extra_args != NULL:
-        dest_state.string = <Py_UNICODE*>extra_args
+        dest_state.string = <char *>extra_args
     else:
         dest_state.string = src_state.string
 
@@ -32,9 +32,9 @@ cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
     state.length = n
     state.x = 1
     if extra_args == NULL:
-        state.string = u'default'
+        state.string = 'default'
     else:
-        state.string = <Py_UNICODE*>extra_args
+        state.string = <char *>extra_args
     return state
 
 
@@ -77,7 +77,7 @@ def test_initialize(nr_class, beam_width, length):
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length, s.length
-        assert s.string == 'default'
+        assert s.string.decode('utf8') == 'default'
 
 
 @cytest
@@ -88,11 +88,12 @@ def test_initialize(nr_class, beam_width, length):
                          ]
                          )
 def test_initialize_extra(nr_class, beam_width, length, extra):
+    extra = extra.encode("utf-8") if extra is not None else None
     b = Beam(nr_class, beam_width)
     if extra is None:
         b.initialize(initialize, destroy, length, NULL)
     else:
-        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+        b.initialize(initialize, destroy, length, <void*><char*>extra)
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length

From c47d4bd6e8a35196aeb4f3a8a26585077a8621a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 14:59:01 +0100
Subject: [PATCH 217/504] Construct TextCatEnsemble.v2 using helper function

---
 spacy/ml/models/textcat.py | 44 +++++++-------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 4b3d2de9171..19ae2579984 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -19,6 +19,7 @@
     clone,
     concatenate,
     list2ragged,
+    noop,
     reduce_first,
     reduce_last,
     reduce_max,
@@ -160,55 +161,26 @@ def build_text_classifier_v2(
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
-    # TODO: build the model with _build_parametric_attention_with_residual_nonlinear
-    # in spaCy v4. We don't do this in spaCy v3 to preserve model
-    # compatibility.
+    width = tok2vec.maybe_get_dim("nO")
     exclusive_classes = not linear_model.attrs["multi_label"]
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=noop(),
+    )
     with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention(width)
-        maxout_layer = Maxout(nO=width, nI=width)
-        norm_layer = LayerNorm(nI=width)
-        cnn_model = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(maxout_layer >> norm_layer >> Dropout(0.0))
-        )
-
         nO_double = nO * 2 if nO else None
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=nO_double)
         else:
             output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
-        model = (linear_model | cnn_model) >> output_layer
+        model = (linear_model | parametric_attention) >> output_layer
         model.set_ref("tok2vec", tok2vec)
     if model.has_dim("nO") is not False and nO is not None:
         model.set_dim("nO", cast(int, nO))
     model.set_ref("output_layer", linear_model.get_ref("output_layer"))
-    model.set_ref("attention_layer", attention_layer)
-    model.set_ref("maxout_layer", maxout_layer)
-    model.set_ref("norm_layer", norm_layer)
     model.attrs["multi_label"] = not exclusive_classes
 
-    model.init = init_ensemble_textcat  # type: ignore[assignment]
-    return model
-
-
-def init_ensemble_textcat(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("maxout_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
     return model
 
 

From 3e53fc6460481a49d36e026748c2c7178caad6f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 15:02:02 +0100
Subject: [PATCH 218/504] Remove `setup_requires` from `setup.cfg`

---
 setup.cfg | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 71dbf6a5337..ef7d9f7dafc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,14 +30,6 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0

From cd85421f353af76a9bb41afa3139b245ea270cc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 17:18:49 +0100
Subject: [PATCH 219/504] Fix up requirements test

To account for buil dependencies being removed from `setup.cfg`.
---
 spacy/tests/package/test_requirements.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index a63b1d8b060..86bdc730c19 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -67,26 +67,28 @@ def test_build_dependencies():
                     "{} and {} respectively".format(lib, v, req_v)
                 )
                 setup_keys.add(lib)
-    assert sorted(setup_keys) == sorted(
-        req_dict.keys()
-    )  # if fail: requirements.txt contains a lib not in setup.cfg
 
     # check pyproject.toml and compare the versions of the libs to requirements.txt
     # does not fail when there are missing or additional libs
     toml_file = root_dir / "pyproject.toml"
     with toml_file.open() as f:
         lines = f.readlines()
+    pyproject_keys = set()
     for line in lines:
         line = line.strip().strip(",").strip('"')
         if not line.startswith("#"):
             lib, v = _parse_req(line)
             if lib and lib not in libs_ignore_requirements:
+                pyproject_keys.add(lib)
                 req_v = req_dict.get(lib, None)
                 assert (lib + v) == (lib + req_v), (
                     "{} has different version in pyproject.toml and in requirements.txt: "
                     "{} and {} respectively".format(lib, v, req_v)
                 )
 
+    # if fail: requirements.txt contains a lib not in setup.cfg or pyproject.toml
+    assert set(setup_keys).union(set(pyproject_keys)) == set(req_dict.keys())
+
 
 def _parse_req(line):
     lib = re.match(r"^[a-z0-9\-]*", line).group(0)

From 4e753e13ebae04fa18e0aa9f96bbdd187619d281 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 12:54:23 +0100
Subject: [PATCH 220/504] Set version to v4.0.0.dev2 (#13269)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 73f201af5fb..ef80718fee0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev1"
+__version__ = "4.0.0.dev2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From cf9e452932d3b0f97b3e65921ddd85d89feaf3fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 18:24:22 +0100
Subject: [PATCH 221/504] Update `spacy-legacy` dependency to 4.0.0.dev1
 (#13270)

This release is compatible with the parser refactor backout.
---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3b6c7dbdd8e..80d725dc80c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev0,<4.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index ef7d9f7dafc..2c2d6f379fa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,7 +32,7 @@ include_package_data = true
 python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=4.0.0.dev0,<4.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0

From 32011a5db98870199ec91f26feaa5436f0b13f49 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Feb 2024 14:14:55 +0100
Subject: [PATCH 222/504] Remove debug data normalization for span analysis
 (#13203)

* Remove debug data normalization for span analysis

As a result of this normalization, `debug data` could show a user tokens
that do not exist in their data.

* Update spacy/cli/debug_data.py

---------

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/debug_data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 714969be145..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
+            t = token.text.lower()
             word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)

From fd32accbe9894f4196fedb35201b25fd08e5254e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 12 Jan 2022 13:38:52 +0100
Subject: [PATCH 223/504] Span/SpanGroup: wrap SpanC in shared_ptr (#9869)

* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/_parser_internals/ner.pyx |  32 ++--
 spacy/tests/doc/test_span.py             |  23 +--
 spacy/tokens/span.pxd                    |  11 +-
 spacy/tokens/span.pyx                    | 181 +++++++++++++----------
 spacy/tokens/span_group.pxd              |   5 +-
 spacy/tokens/span_group.pyx              |  22 +--
 6 files changed, 154 insertions(+), 120 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index e4312bd2f92..c77b7b50f2d 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,4 +1,8 @@
-# cython: profile=False
+import os
+import random
+from libc.stdint cimport int32_t
+from libcpp.memory cimport shared_ptr
+from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 
@@ -47,9 +51,7 @@ MOVE_NAMES[OUT] = 'O'
 
 cdef struct GoldNERStateC:
     Transition* ner
-    SpanC* negs
-    int32_t length
-    int32_t nr_neg
+    vector[shared_ptr[SpanC]] negs
 
 
 cdef class BiluoGold:
@@ -82,8 +84,6 @@ cdef GoldNERStateC create_gold_state(
         negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
-    gs.nr_neg = len(negs)
     ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
@@ -97,8 +97,8 @@ cdef GoldNERStateC create_gold_state(
     # In order to handle negative samples, we need to maintain the full
     # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
     # thing, we'll get blocked if there's an incorrect prefix.
-    for i, neg in enumerate(negs):
-        gs.negs[i] = neg.c
+    for neg in negs:
+        gs.negs.push_back(neg.c)
     return gs
 
 
@@ -413,6 +413,8 @@ cdef class Begin:
         cdef int g_act = gold.ner[b0].move
         cdef attr_t g_tag = gold.ner[b0].label
 
+        cdef shared_ptr[SpanC] span
+
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
@@ -430,8 +432,8 @@ cdef class Begin:
             # be correct or not. However, we can at least tell whether we're
             # going to be opening an entity where there's only one possible
             # L.
-            for span in gold.negs[:gold.nr_neg]:
-                if span.label == label and span.start == b0:
+            for span in gold.negs:
+                if span.get().label == label and span.get().start == b0:
                     cost += 1
                     break
         return cost
@@ -572,8 +574,9 @@ cdef class Last:
         # If we have negative-example entities, integrate them into the objective,
         # by marking actions that close an entity that we know is incorrect
         # as costly.
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
                 cost += 1
                 break
         return cost
@@ -637,8 +640,9 @@ cdef class Unit:
         # This is fairly straight-forward for U- entities, as we have a single
         # action
         cdef int b0 = s.B(0)
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and span.start == b0 and span.end == (b0+1):
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
                 cost += 1
                 break
         return cost
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 98a74bc2145..e5c71dafcf7 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -5,7 +5,8 @@
 
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
-from spacy.tokens import Doc, Span, Token
+from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
@@ -163,16 +164,16 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
-def test_char_span_attributes(doc):
-    label = "LABEL"
-    kb_id = "KB_ID"
-    span_id = "SPAN_ID"
-    span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
-    span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
-    assert span1.text == span2.text
-    assert span1.label_ == span2.label_ == label
-    assert span1.kb_id_ == span2.kb_id_ == kb_id
-    assert span1.id_ == span2.id_ == span_id
+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
 
 
 def test_spans_sent_spans(doc):
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index d77bbea7035..ce318ed0dfb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 
 from ..structs cimport SpanC
@@ -7,19 +8,21 @@ from .doc cimport Doc
 
 cdef class Span:
     cdef readonly Doc doc
-    cdef SpanC c
+    cdef shared_ptr[SpanC] c
     cdef public _vector
     cdef public _vector_norm
 
     @staticmethod
-    cdef inline Span cinit(Doc doc, SpanC span):
+    cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span):
         cdef Span self = Span.__new__(
             Span,
             doc,
-            start=span.start,
-            end=span.end
+            start=span.get().start,
+            end=span.get().end
         )
         self.c = span
         return self
 
     cpdef np.ndarray to_array(self, object features)
+
+    cdef SpanC* span_c(self)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 64b8d7c6c1d..5afe2ffbc8e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,8 +1,7 @@
 # cython: profile=False
 cimport numpy as np
-
-import copy
-import warnings
+from libc.math cimport sqrt
+from libcpp.memory cimport make_shared
 
 import numpy
 from thinc.api import get_array_module
@@ -115,7 +114,7 @@ cdef class Span:
             end_char = start_char
         else:
             end_char = doc[end - 1].idx + len(doc[end - 1])
-        self.c = SpanC(
+        self.c = make_shared[SpanC](SpanC(
             label=label,
             kb_id=kb_id,
             id=span_id,
@@ -123,7 +122,7 @@ cdef class Span:
             end=end,
             start_char=start_char,
             end_char=end_char,
-        )
+        ))
         self._vector = vector
         self._vector_norm = vector_norm
 
@@ -133,32 +132,46 @@ cdef class Span:
                 return False
             else:
                 return True
-        if not isinstance(other, Span):
-            return False
-        cdef Span other_span = other
-        self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
-        other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc)
+
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* other_span_c = other.span_c()
+
         # <
         if op == 0:
-            return self_tuple < other_tuple
+            return span_c.start_char < other_span_c.start_char
         # <=
         elif op == 1:
-            return self_tuple <= other_tuple
+            return span_c.start_char <= other_span_c.start_char
         # ==
         elif op == 2:
-            return self_tuple == other_tuple
+            # Do the cheap comparisons first
+            return (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # !=
         elif op == 3:
-            return self_tuple != other_tuple
+            # Do the cheap comparisons first
+            return not (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # >
         elif op == 4:
-            return self_tuple > other_tuple
+            return span_c.start_char > other_span_c.start_char
         # >=
         elif op == 5:
-            return self_tuple >= other_tuple
+            return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
-        return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.c.id))
+        cdef SpanC* span_c = self.span_c()
+        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
 
     def __len__(self):
         """Get the number of tokens in the span.
@@ -167,9 +180,10 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#len
         """
-        if self.c.end < self.c.start:
+        cdef SpanC* span_c = self.span_c()
+        if span_c.end < span_c.start:
             return 0
-        return self.c.end - self.c.start
+        return span_c.end - span_c.start
 
     def __repr__(self):
         return self.text
@@ -183,15 +197,16 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#getitem
         """
+        cdef SpanC* span_c = self.span_c()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
         else:
             if i < 0:
-                token_i = self.c.end + i
+                token_i = span_c.end + i
             else:
-                token_i = self.c.start + i
-            if self.c.start <= token_i < self.c.end:
+                token_i = span_c.start + i
+            if span_c.start <= token_i < span_c.end:
                 return self.doc[token_i]
             else:
                 raise IndexError(Errors.E1002)
@@ -203,7 +218,8 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#iter
         """
-        for i in range(self.c.start, self.c.end):
+        cdef SpanC* span_c = self.span_c()
+        for i in range(span_c.start, span_c.end):
             yield self.doc[i]
 
     def __reduce__(self):
@@ -211,9 +227,10 @@ cdef class Span:
 
     @property
     def _(self):
+        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=self.c.start_char, end=self.c.end_char)
+                          start=span_c.start_char, end=span_c.end_char)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -287,13 +304,14 @@ cdef class Span:
         cdef int length = len(array)
         cdef attr_t value
         cdef int i, head_col, ancestor_i
+        cdef SpanC* span_c = self.span_c()
         old_to_new_root = dict()
         if HEAD in attrs:
             head_col = attrs.index(HEAD)
             for i in range(length):
                 # if the HEAD refers to a token outside this span, find a more appropriate ancestor
                 token = self[i]
-                ancestor_i = token.head.i - self.c.start   # span offset
+                ancestor_i = token.head.i - span_c.start   # span offset
                 if ancestor_i not in range(length):
                     if DEP in attrs:
                         array[i, attrs.index(DEP)] = dep
@@ -301,7 +319,7 @@ cdef class Span:
                     # try finding an ancestor within this span
                     ancestors = token.ancestors
                     for ancestor in ancestors:
-                        ancestor_i = ancestor.i - self.c.start
+                        ancestor_i = ancestor.i - span_c.start
                         if ancestor_i in range(length):
                             array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
 
@@ -330,7 +348,8 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#get_lca_matrix
         """
-        return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end))
+        cdef SpanC* span_c = self.span_c()
+        return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -440,6 +459,9 @@ cdef class Span:
         else:
             raise ValueError(Errors.E030)
 
+    cdef SpanC* span_c(self):
+        return self.c.get()
+
     @property
     def sents(self):
         """Obtain the sentences that contain this span. If the given span
@@ -494,10 +516,13 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#ents
         """
         cdef Span ent
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* ent_span_c
         ents = []
         for ent in self.doc.ents:
-            if ent.c.start >= self.c.start:
-                if ent.c.end <= self.c.end:
+            ent_span_c = ent.span_c()
+            if ent_span_c.start >= span_c.start:
+                if ent_span_c.end <= span_c.end:
                     ents.append(ent)
                 else:
                     break
@@ -631,11 +656,12 @@ cdef class Span:
         # This should probably be called 'head', and the other one called
         # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
+        cdef SpanC* span_c = self.span_c()
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The
         # longer the span, the more likely it contains a sentence root, and
         # in this case we return in linear time.
-        for i in range(self.c.start, self.c.end):
+        for i in range(span_c.start, span_c.end):
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
@@ -646,15 +672,15 @@ cdef class Span:
         # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
-        for i in range(self.c.start, self.c.end):
-            if self.c.start <= (i+self.doc.c[i].head) < self.c.end:
+        for i in range(span_c.start, span_c.end):
+            if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
                 continue
             words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
             if words_to_root < current_best:
                 current_best = words_to_root
                 root = i
         if root == -1:
-            return self.doc[self.c.start]
+            return self.doc[span_c.start]
         else:
             return self.doc[root]
 
@@ -677,9 +703,10 @@ cdef class Span:
         span_id (Union[int, str]): An identifier to associate with the span.
         RETURNS (Span): The newly constructed object.
         """
-        start_idx += self.c.start_char
-        end_idx += self.c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
+        cdef SpanC* span_c = self.span_c()
+        start_idx += span_c.start_char
+        end_idx += span_c.start_char
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
 
     @property
     def conjuncts(self):
@@ -757,61 +784,55 @@ cdef class Span:
         for word in self.rights:
             yield from word.subtree
 
-    @property
-    def start(self):
-        return self.c.start
+    property start:
+        def __get__(self):
+            return self.span_c().start
 
-    @start.setter
-    def start(self, int start):
-        if start < 0:
-            raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-        self.c.start = start
+        def __set__(self, int start):
+            if start < 0:
+                raise IndexError("TODO")
+            self.span_c().start = start
 
-    @property
-    def end(self):
-        return self.c.end
+    property end:
+        def __get__(self):
+            return self.span_c().end
 
-    @end.setter
-    def end(self, int end):
-        if end < 0:
-            raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-        self.c.end = end
+        def __set__(self, int end):
+            if end < 0:
+                raise IndexError("TODO")
+            self.span_c().end = end
 
-    @property
-    def start_char(self):
-        return self.c.start_char
+    property start_char:
+        def __get__(self):
+            return self.span_c().start_char
 
-    @start_char.setter
-    def start_char(self, int start_char):
-        if start_char < 0:
-            raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-        self.c.start_char = start_char
+        def __set__(self, int start_char):
+            if start_char < 0:
+                raise IndexError("TODO")
+            self.span_c().start_char = start_char
 
-    @property
-    def end_char(self):
-        return self.c.end_char
+    property end_char:
+        def __get__(self):
+            return self.span_c().end_char
 
-    @end_char.setter
-    def end_char(self, int end_char):
-        if end_char < 0:
-            raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-        self.c.end_char = end_char
+        def __set__(self, int end_char):
+            if end_char < 0:
+                raise IndexError("TODO")
+            self.span_c().end_char = end_char
 
-    @property
-    def label(self):
-        return self.c.label
+    property label:
+        def __get__(self):
+            return self.span_c().label
 
-    @label.setter
-    def label(self, attr_t label):
-        self.c.label = label
+        def __set__(self, attr_t label):
+            self.span_c().label = label
 
-    @property
-    def kb_id(self):
-        return self.c.kb_id
+    property kb_id:
+        def __get__(self):
+            return self.span_c().kb_id
 
-    @kb_id.setter
-    def kb_id(self, attr_t kb_id):
-        self.c.kb_id = kb_id
+        def __set__(self, attr_t kb_id):
+            self.span_c().kb_id = kb_id
 
     @property
     def id(self):
diff --git a/spacy/tokens/span_group.pxd b/spacy/tokens/span_group.pxd
index 7f4145682eb..6f0ffd0eb36 100644
--- a/spacy/tokens/span_group.pxd
+++ b/spacy/tokens/span_group.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
 from ..structs cimport SpanC
@@ -7,6 +8,6 @@ cdef class SpanGroup:
     cdef public object _doc_ref
     cdef public str name
     cdef public dict attrs
-    cdef vector[SpanC] c
+    cdef vector[shared_ptr[SpanC]] c
 
-    cdef void push_back(self, SpanC span) nogil
+    cdef void push_back(self, const shared_ptr[SpanC] &span)
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 257c907bcce..8a524926a03 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -9,6 +9,8 @@ import srsly
 from spacy.errors import Errors
 
 from .span cimport Span
+from libc.stdint cimport uint64_t, uint32_t, int32_t
+from libcpp.memory cimport make_shared
 
 
 cdef class SpanGroup:
@@ -202,10 +204,12 @@ cdef class SpanGroup:
 
         DOCS: https://spacy.io/api/spangroup#to_bytes
         """
+        cdef SpanC* span_c
         output = {"name": self.name, "attrs": self.attrs, "spans": []}
         cdef int i
         for i in range(self.c.size()):
             span = self.c[i]
+            span_c = span.get()
             # The struct.pack here is probably overkill, but it might help if
             # you're saving tonnes of spans, and it doesn't really add any
             # complexity. We do take care to specify little-endian byte order
@@ -217,13 +221,13 @@ cdef class SpanGroup:
             # l: int32_t
             output["spans"].append(struct.pack(
                 ">QQQllll",
-                span.id,
-                span.kb_id,
-                span.label,
-                span.start,
-                span.end,
-                span.start_char,
-                span.end_char
+                span_c.id,
+                span_c.kb_id,
+                span_c.label,
+                span_c.start,
+                span_c.end,
+                span_c.start_char,
+                span_c.end_char
             ))
         return srsly.msgpack_dumps(output)
 
@@ -250,10 +254,10 @@ cdef class SpanGroup:
             span.end = items[4]
             span.start_char = items[5]
             span.end_char = items[6]
-            self.c.push_back(span)
+            self.c.push_back(make_shared[SpanC](span))
         return self
 
-    cdef void push_back(self, SpanC span) nogil:
+    cdef void push_back(self, const shared_ptr[SpanC] &span):
         self.c.push_back(span)
 
     def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:

From 7d13aff497051c9147f3890f41e2ece5bcc335e1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 15 Apr 2022 15:34:58 +0200
Subject: [PATCH 224/504] Return doc offsets in Matcher on spans (#10576)

The returned match offsets were only adjusted for `as_spans`, not
generally. Because the `on_match` callbacks are always applied to the
doc, the `Matcher` matches on spans should consistently use the doc
offsets.
---
 spacy/matcher/matcher.pyx               |  7 ++++---
 spacy/tests/matcher/test_matcher_api.py | 13 ++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 9a9ed421223..f0116169a6b 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -275,6 +275,10 @@ cdef class Matcher:
         # non-overlapping ones this `match` can be either (start, end) or
         # (start, end, alignments) depending on `with_alignments=` option.
         for key, *match in matches:
+            # Adjust span matches to doc offsets
+            if isinstance(doclike, Span):
+                match[0] += doclike.start
+                match[1] += doclike.start
             span_filter = self._filter.get(key)
             if span_filter is not None:
                 pairs = pairs_by_id.get(key, [])
@@ -305,9 +309,6 @@ cdef class Matcher:
         if as_spans:
             final_results = []
             for key, start, end, *_ in final_matches:
-                if isinstance(doclike, Span):
-                    start += doclike.start
-                    end += doclike.start
                 final_results.append(Span(doc, start, end, label=key))
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index c824ca39253..106a00b3011 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -794,9 +794,16 @@ def test_matcher_span(matcher):
     doc = Doc(matcher.vocab, words=text.split())
     span_js = doc[:3]
     span_java = doc[4:]
-    assert len(matcher(doc)) == 2
-    assert len(matcher(span_js)) == 1
-    assert len(matcher(span_java)) == 1
+    doc_matches = matcher(doc)
+    span_js_matches = matcher(span_js)
+    span_java_matches = matcher(span_java)
+    assert len(doc_matches) == 2
+    assert len(span_js_matches) == 1
+    assert len(span_java_matches) == 1
+
+    # match offsets always refer to the doc
+    assert doc_matches[0] == span_js_matches[0]
+    assert doc_matches[1] == span_java_matches[0]
 
 
 def test_matcher_as_spans(matcher):

From d9ef4460b9787d17a6ab783f18e0cf6ff63b661e Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 15 Jul 2022 11:14:08 +0200
Subject: [PATCH 225/504] `Morphology`/`Morphologizer` optimizations and
 refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologzier`: Avoid unnecessary sorting of morpho features

* `Morphologizer`: Remove execessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
---
 spacy/morphology.pxd               |  47 +++--
 spacy/morphology.pyx               | 274 +++++++++++++++++------------
 spacy/pipeline/morphologizer.pyx   |  30 ++--
 spacy/structs.pxd                  |   8 -
 spacy/tokens/morphanalysis.pxd     |   9 +-
 spacy/tokens/morphanalysis.pyx     |  40 +++--
 spacy/tokens/token.pyx             |  12 +-
 website/docs/api/morphologizer.mdx |   2 +-
 8 files changed, 244 insertions(+), 178 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index ee43aa4ec81..494088879b1 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,27 +1,42 @@
 cimport numpy as np
-from cymem.cymem cimport Pool
-from libc.stdint cimport uint64_t
-from preshed.maps cimport PreshMap
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.unordered_map cimport unordered_map
+from libcpp.vector cimport vector
+from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
 from .typedefs cimport attr_t, hash_t
 
 
+cdef cppclass Feature:
+    hash_t field
+    hash_t value
+
+    __init__():
+        this.field = 0
+        this.value = 0
+
+
+cdef cppclass MorphAnalysisC:
+    hash_t key  
+    vector[Feature] features
+
+    __init__():
+        this.key = 0
+
 cdef class Morphology:
-    cdef readonly Pool mem
     cdef readonly StringStore strings
-    cdef PreshMap tags  # Keyed by hash, value is pointer to tag
-
-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
-    cdef int insert(self, MorphAnalysisC tag) except -1
+    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
 
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
+    cdef hash_t _add(self, features)
+    cdef str _normalize_features(self, features)
+    cdef str get_morph_str(self, hash_t morph_key)
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
 
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
-cdef list list_features(const MorphAnalysisC* morph)
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
-cdef int get_n_by_field(
-    attr_t* results,
-    const MorphAnalysisC* morph,
-    attr_t field,
-) nogil
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index cef45b04d14..7ee621056f1 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,11 +1,11 @@
 # cython: infer_types
 # cython: profile=False
 import warnings
+from typing import Union, Tuple, List, Dict, Optional
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr
 
-import numpy
-
-from .attrs cimport POS
-
+from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 from .parts_of_speech import IDS as POS_IDS
@@ -26,135 +26,187 @@ cdef class Morphology:
     EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
-        self.mem = Pool()
         self.strings = strings
-        self.tags = PreshMap()
 
     def __reduce__(self):
         tags = set([self.get(self.strings[s]) for s in self.strings])
         tags -= set([""])
         return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
 
-    def add(self, features):
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
+        match = self.tags.find(tag_hash)
+        if match != self.tags.const_end():
+            return deref(match).second
+        else:
+            return shared_ptr[MorphAnalysisC]()
+
+    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
+        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
+            attr_key = self.strings.as_string(attr_key)
+            attr_value = self.strings.as_string(attr_value)
+
+            # Preserve multiple values as a list
+            if self.VALUE_SEP in attr_value:
+                values = attr_value.split(self.VALUE_SEP)
+                values.sort()
+                attr_value = values
+        else:
+            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
+            return None
+
+        return attr_key, attr_value
+
+    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
+        if not feats or feats == self.EMPTY_MORPH:
+            return {}
+
+        out = []
+        for feat in feats.split(self.FEATURE_SEP):
+            field, values = feat.split(self.FIELD_SEP, 1)
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
+        out = []
+        for field, values in feats.items():
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+
+    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
+        norm_feats_string = self.FEATURE_SEP.join([
+                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            for field, values in feats.items()
+        ])
+        return norm_feats_string or self.EMPTY_MORPH
+
+
+    cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
         FEATS format as a string or in the tag map dict format.
         Returns the hash of the new analysis.
         """
-        cdef MorphAnalysisC* tag_ptr
+        cdef hash_t tag_hash = 0
+        cdef shared_ptr[MorphAnalysisC] tag
         if isinstance(features, str):
             if features == "":
                 features = self.EMPTY_MORPH
-            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
-            if tag_ptr != NULL:
-                return tag_ptr.key
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+
+            tag_hash = self.strings[features]
+            tag = self._lookup_tag(tag_hash)
+            if tag:
+                return deref(tag).key
+
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # intified ("Field", "Field=Value") pairs
-        field_feature_pairs = []
-        for field in sorted(string_features):
-            values = string_features[field]
-            for value in values.split(self.VALUE_SEP):
-                field_feature_pairs.append((
-                    self.strings.add(field),
-                    self.strings.add(field + self.FIELD_SEP + value),
-                ))
-        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
+
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
-        norm_feats_string = self.normalize_features(features)
-        tag.key = self.strings.add(norm_feats_string)
-        self.insert(tag)
-        return tag.key
+        norm_feats_string = self._normalized_feat_dict_to_str(features)
+        tag_hash = self.strings.add(norm_feats_string)
+        tag = self._lookup_tag(tag_hash)
+        if tag:
+            return deref(tag).key
+
+        self._intern_morph_tag(tag_hash, features)
+        return tag_hash
+
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
+        # intified ("Field", "Field=Value") pairs where fields with multiple values have
+        # been split into individual tuples, e.g.:
+        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
+        # ("Field2", "Field2=Value3")]
+        field_feature_pairs = []
 
-    def normalize_features(self, features):
+        # Feat dict is normalized at this point.
+        for field, values in feats.items():
+            field_key = self.strings.add(field)
+            if isinstance(values, list):
+                for value in values:
+                    value_key = self.strings.add(field + self.FIELD_SEP + value)
+                    field_feature_pairs.append((field_key, value_key))
+            else:
+                # We could box scalar values into a list and use a common
+                # code path to generate features but that incurs a small 
+                # but measurable allocation/iteration overhead (as this
+                # branch is taken often enough).
+                value_key = self.strings.add(field + self.FIELD_SEP + values)
+                field_feature_pairs.append((field_key, value_key))
+
+        num_features = len(field_feature_pairs)
+        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
+        deref(tag).key = tag_key
+        deref(tag).features.resize(num_features)
+
+        for i in range(num_features):
+            deref(tag).features[i].field = field_feature_pairs[i][0]
+            deref(tag).features[i].value = field_feature_pairs[i][1]
+
+        self.tags[tag_key] = tag
+
+    cdef str get_morph_str(self, hash_t morph_key):
+        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
+        if not tag:
+            return ""
+        else:
+            return self.strings[deref(tag).key]
+
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
+        return self._lookup_tag(morph_key)
+
+    cdef str _normalize_features(self, features):
         """Create a normalized FEATS string from a features string or dict.
 
         features (Union[dict, str]): Features as dict or UFEATS string.
         RETURNS (str): Features as normalized UFEATS string.
         """
         if isinstance(features, str):
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = self.normalize_attrs(features)
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(
-            sorted(
-                [self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
-            )
-        )
-        return norm_feats_string or self.EMPTY_MORPH
 
-    def normalize_attrs(self, attrs):
-        """Convert attrs dict so that POS is always by ID, other features are
-        by string. Values separated by VALUE_SEP are sorted.
-        """
-        out = {}
-        attrs = dict(attrs)
-        for key, value in attrs.items():
-            # convert POS value to ID
-            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
-                if isinstance(value, str) and value.upper() in POS_IDS:
-                    value = POS_IDS[value.upper()]
-                elif isinstance(value, int) and value not in POS_IDS.values():
-                    warnings.warn(Warnings.W100.format(feature={key: value}))
-                    continue
-                out[POS] = value
-            # accept any string or ID fields and values and convert to strings
-            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
-                key = self.strings.as_string(key)
-                value = self.strings.as_string(value)
-                # sort values
-                if self.VALUE_SEP in value:
-                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
-                out[key] = value
-            else:
-                warnings.warn(Warnings.W100.format(feature={key: value}))
-        return out
+        return self._normalized_feat_dict_to_str(features)
 
-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
-        """Creates a MorphAnalysisC from a list of intified
-        ("Field", "Field=Value") tuples where fields with multiple values have
-        been split into individual tuples, e.g.:
-        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
-        ("Field2", "Field2=Value3")]
-        """
-        cdef MorphAnalysisC tag
-        tag.length = len(field_feature_pairs)
-        if tag.length > 0:
-            tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-            tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-        for i, (field, feature) in enumerate(field_feature_pairs):
-            tag.fields[i] = field
-            tag.features[i] = feature
-        return tag
-
-    cdef int insert(self, MorphAnalysisC tag) except -1:
-        cdef hash_t key = tag.key
-        if self.tags.get(key) == NULL:
-            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            tag_ptr[0] = tag
-            self.tags.set(key, <void*>tag_ptr)
-
-    def get(self, hash_t morph):
-        tag = <MorphAnalysisC*>self.tags.get(morph)
-        if tag == NULL:
-            return ""
-        else:
-            return self.strings[tag.key]
+    def add(self, features):
+        return self._add(features)
+
+    def get(self, morph_key):
+        return self.get_morph_str(morph_key)
+
+    def normalize_features(self, features):
+        return self._normalize_features(features)
 
     @staticmethod
-    def feats_to_dict(feats):
+    def feats_to_dict(feats, *, sort_values=True):
         if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
-        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
-                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
+
+        out = {}
+        for feat in feats.split(Morphology.FEATURE_SEP):
+            field, values = feat.split(Morphology.FIELD_SEP, 1)
+            if sort_values:
+                values = values.split(Morphology.VALUE_SEP)
+                values.sort()
+                values = Morphology.VALUE_SEP.join(values)
+
+            out[field] = values
+        return out
 
     @staticmethod
     def dict_to_feats(feats_dict):
@@ -163,34 +215,34 @@ cdef class Morphology:
         return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
 
 
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
     cdef int i
-    for i in range(morph.length):
-        if morph.features[i] == feature:
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].value == feature:
             return True
     return False
 
 
-cdef list list_features(const MorphAnalysisC* morph):
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
     cdef int i
     features = []
-    for i in range(morph.length):
-        features.append(morph.features[i])
+    for i in range(deref(morph).features.size()):
+        features.append(deref(morph).features[i].value)
     return features
 
 
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
-    cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
+    cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
     n = get_n_by_field(<uint64_t*>results.data, morph, field)
     return results[:n]
 
 
-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
     cdef int n_results = 0
     cdef int i
-    for i in range(morph.length):
-        if morph.fields[i] == field:
-            results[n_results] = morph.features[i]
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].field == field:
+            results[n_results] = deref(morph).features[i].value
             n_results += 1
     return n_results
 
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d415ae43c5c..bdbe75fd824 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -132,8 +132,8 @@ class Morphologizer(Tagger):
 
     @property
     def labels(self):
-        """RETURNS (Tuple[str]): The labels currently added to the component."""
-        return tuple(self.cfg["labels_morph"].keys())
+        """RETURNS (Iterable[str]): The labels currently added to the component."""
+        return self.cfg["labels_morph"].keys()
 
     @property
     def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
@@ -156,7 +156,7 @@ class Morphologizer(Tagger):
         # normalize label
         norm_label = self.vocab.morphology.normalize_features(label)
         # extract separate POS and morph tags
-        label_dict = Morphology.feats_to_dict(label)
+        label_dict = Morphology.feats_to_dict(label, sort_values=False)
         pos = label_dict.get(self.POS_FEAT, "")
         if self.POS_FEAT in label_dict:
             label_dict.pop(self.POS_FEAT)
@@ -194,7 +194,7 @@ class Morphologizer(Tagger):
                         continue
                     morph = str(token.morph)
                     # create and add the combined morph+POS label
-                    morph_dict = Morphology.feats_to_dict(morph)
+                    morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                     if pos:
                         morph_dict[self.POS_FEAT] = pos
                     norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@@ -211,7 +211,7 @@ class Morphologizer(Tagger):
             for i, token in enumerate(example.reference):
                 pos = token.pos_
                 morph = str(token.morph)
-                morph_dict = Morphology.feats_to_dict(morph)
+                morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@@ -235,26 +235,29 @@ class Morphologizer(Tagger):
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         cdef bint extend = self.cfg["extend"]
-        labels = self.labels
+
+        # We require random access for the upcoming ops, so we need
+        # to allocate a compatible container out of the iterable.
+        labels = tuple(self.labels)
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
-                morph = labels[tag_id]
+                morph = labels[int(tag_id)]
                 # set morph
                 if doc.c[j].morph == 0 or overwrite or extend:
                     if overwrite and extend:
                         # morphologizer morph overwrites any existing features
                         # while extending
-                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
-                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
+                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
                         doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                     elif extend:
                         # existing features are preserved and any new features
                         # are added
-                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
-                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
+                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
                         doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                     else:
                         # clobber
@@ -274,8 +277,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
-                                                    label_smoothing=self.cfg["label_smoothing"])
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
@@ -296,7 +298,7 @@ class Morphologizer(Tagger):
                     label = None
                 # Otherwise, generate the combined label
                 else:
-                    label_dict = Morphology.feats_to_dict(morph)
+                    label_dict = Morphology.feats_to_dict(morph, sort_values=False)
                     if pos:
                         label_dict[self.POS_FEAT] = pos
                     label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 8cfcc2964f6..e7513cc11b7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -57,14 +57,6 @@ cdef struct TokenC:
     hash_t ent_id
 
 
-cdef struct MorphAnalysisC:
-    hash_t key
-    int length
-
-    attr_t* fields
-    attr_t* features
-
-
 # Internal struct, for storage and disambiguation of entities.
 cdef struct KBEntryC:
 
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 728f0aaf75a..f866488ecc2 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,9 +1,12 @@
-from ..structs cimport MorphAnalysisC
-from ..typedefs cimport hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t
+from ..morphology cimport MorphAnalysisC
+from libcpp.memory cimport shared_ptr
 
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
     cdef readonly hash_t key
-    cdef MorphAnalysisC c
+    cdef shared_ptr[MorphAnalysisC] c
+
+    cdef void _init_c(self, hash_t key)
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index ea5d07fa449..ceaa3ecd04e 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -8,6 +8,13 @@ from ..morphology import Morphology
 from ..morphology cimport check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t, attr_t
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
 
 
 cdef class MorphAnalysis:
@@ -15,39 +22,38 @@ cdef class MorphAnalysis:
     def __init__(self, Vocab vocab, features=dict()):
         self.vocab = vocab
         self.key = self.vocab.morphology.add(features)
-        analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key)
-        if analysis is not NULL:
-            self.c = analysis[0]
+        self._init_c(self.key)
+
+    cdef void _init_c(self, hash_t key):
+        cdef shared_ptr[MorphAnalysisC] analysis = self.vocab.morphology.get_morph_c(key)
+        if analysis:
+            self.c = analysis
         else:
-            memset(&self.c, 0, sizeof(self.c))
+            self.c = EMPTY_MORPH_TAG
 
     @classmethod
     def from_id(cls, Vocab vocab, hash_t key):
         """Create a morphological analysis from a given ID."""
-        cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
+        cdef MorphAnalysis morph = MorphAnalysis(vocab)
         morph.vocab = vocab
         morph.key = key
-        analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key)
-        if analysis is not NULL:
-            morph.c = analysis[0]
-        else:
-            memset(&morph.c, 0, sizeof(morph.c))
+        morph._init_c(key)
         return morph
 
     def __contains__(self, feature):
         """Test whether the morphological analysis contains some feature."""
         cdef attr_t feat_id = self.vocab.strings.as_int(feature)
-        return check_feature(&self.c, feat_id)
+        return check_feature(self.c, feat_id)
 
     def __iter__(self):
         """Iterate over the features in the analysis."""
         cdef attr_t feature
-        for feature in list_features(&self.c):
+        for feature in list_features(self.c):
             yield self.vocab.strings[feature]
 
     def __len__(self):
         """The number of features in the analysis."""
-        return self.c.length
+        return deref(self.c).features.size()
 
     def __hash__(self):
         return self.key
@@ -63,11 +69,7 @@ cdef class MorphAnalysis:
     def get(self, field, default=None):
         """Retrieve feature values by field."""
         cdef attr_t field_id = self.vocab.strings.as_int(field)
-        cdef np.ndarray results = get_by_field(&self.c, field_id)
-        if len(results) == 0:
-            if default is None:
-                default = []
-            return default
+        cdef np.ndarray results = get_by_field(self.c, field_id)
         features = [self.vocab.strings[result] for result in results]
         return [f.split(Morphology.FIELD_SEP)[1] for f in features]
 
@@ -75,7 +77,7 @@ cdef class MorphAnalysis:
         """Produce a json serializable representation as a UD FEATS-style
         string.
         """
-        morph_string = self.vocab.strings[self.c.key]
+        morph_string = self.vocab.strings[deref(self.c).key]
         if morph_string == self.vocab.morphology.EMPTY_MORPH:
             return ""
         return morph_string
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a3efd5886ee..8daff95b705 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -39,6 +39,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref
 
 
 cdef class Token:
@@ -253,12 +254,11 @@ cdef class Token:
     def morph(self):
         return MorphAnalysis.from_id(self.vocab, self.c.morph)
 
-    @morph.setter
-    def morph(self, MorphAnalysis morph):
-        # Check that the morph has the same vocab
-        if self.vocab != morph.vocab:
-            raise ValueError(Errors.E1013)
-        self.c.morph = morph.c.key
+        def __set__(self, MorphAnalysis morph):
+            # Check that the morph has the same vocab
+            if self.vocab != morph.vocab:
+                raise ValueError(Errors.E1013)
+            self.c.morph = deref(morph.c).key
 
     def set_morph(self, features):
         cdef hash_t key
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 8f189d129c3..ce16f534219 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -402,7 +402,7 @@ coarse-grained POS as the feature `POS`.
 
 | Name        | Description                                            |
 | ----------- | ------------------------------------------------------ |
-| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
+| **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}
 

From b19ad3b753190e165c22bcc23ac44bc368a2e30b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 29 Jul 2022 15:12:19 +0200
Subject: [PATCH 226/504] precompute_hiddens/Parser: look up CPU ops once (v4)
 (#11068)

* precompute_hiddens/Parser: look up CPU ops once

* precompute_hiddens: make cpu_ops private
---
 spacy/ml/parser_model.pyx            | 8 +++-----
 spacy/pipeline/transition_parser.pxd | 1 +
 spacy/pipeline/transition_parser.pyx | 8 ++------
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index f004c562e7d..cb323e98891 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -386,6 +386,7 @@ cdef class precompute_hiddens:
     cdef bint _is_synchronized
     cdef public object ops
     cdef public object numpy_ops
+    cdef public object _cpu_ops
     cdef np.ndarray _features
     cdef np.ndarray _cached
     cdef np.ndarray bias
@@ -416,6 +417,7 @@ cdef class precompute_hiddens:
         self.nO = cached.shape[2]
         self.ops = lower_model.ops
         self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
         assert activation in (None, "relu", "maxout")
         self.activation = activation
         self._is_synchronized = False
@@ -478,11 +480,7 @@ cdef class precompute_hiddens:
         # - Output from backward on GPU
         bp_hiddens = self._bp_hiddens
 
-        cdef CBlas cblas
-        if isinstance(self.ops, CupyOps):
-            cblas = NUMPY_OPS.cblas()
-        else:
-            cblas = self.ops.cblas()
+        cdef CBlas cblas = self._cpu_ops.cblas()
 
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index 7ddb91e0184..7ef20563b12 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -12,6 +12,7 @@ cdef class Parser(TrainablePipe):
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
     cdef public object _multitasks
+    cdef object _cpu_ops
 
     cdef void _parseC(
         self,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9a278fc1328..b8ebbf8ca88 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -135,6 +135,7 @@ cdef class Parser(TrainablePipe):
 
         self._rehearsal_model = None
         self.scorer = scorer
+        self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -273,12 +274,7 @@ cdef class Parser(TrainablePipe):
     def greedy_parse(self, docs, drop=0.):
         cdef vector[StateC*] states
         cdef StateClass state
-        ops = self.model.ops
-        cdef CBlas cblas
-        if isinstance(ops, CupyOps):
-            cblas = NUMPY_OPS.cblas()
-        else:
-            cblas = ops.cblas()
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
         set_dropout_rate(self.model, drop)
         batch = self.moves.init_batch(docs)

From 5713e552ffe3fd449ebe2180974111bcf9a2488f Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 10 Aug 2022 11:44:05 +0200
Subject: [PATCH 227/504] Rename modules for consistency (#11286)

* rename Python module to entity_ruler

* rename Python module to attribute_ruler
---
 spacy/pipeline/__init__.py                               | 6 +++---
 spacy/pipeline/{attributeruler.py => attribute_ruler.py} | 0
 spacy/pipeline/{entityruler.py => entity_ruler.py}       | 0
 website/docs/api/attributeruler.mdx                      | 6 +++---
 website/docs/api/entityruler.mdx                         | 6 +++---
 website/docs/usage/saving-loading.mdx                    | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)
 rename spacy/pipeline/{attributeruler.py => attribute_ruler.py} (100%)
 rename spacy/pipeline/{entityruler.py => entity_ruler.py} (100%)

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2c4a5a8a87f..82d24486a27 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,9 +1,9 @@
-from .attributeruler import AttributeRuler
+from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
-from .entityruler import EntityRuler
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .ner import EntityRecognizer
+from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attribute_ruler.py
similarity index 100%
rename from spacy/pipeline/attributeruler.py
rename to spacy/pipeline/attribute_ruler.py
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entity_ruler.py
similarity index 100%
rename from spacy/pipeline/entityruler.py
rename to spacy/pipeline/entity_ruler.py
diff --git a/website/docs/api/attributeruler.mdx b/website/docs/api/attributeruler.mdx
index c1831918752..e8cb248f85b 100644
--- a/website/docs/api/attributeruler.mdx
+++ b/website/docs/api/attributeruler.mdx
@@ -1,8 +1,8 @@
 ---
 title: AttributeRuler
 tag: class
-source: spacy/pipeline/attributeruler.py
-version: 3
+source: spacy/pipeline/attribute_ruler.py
+new: 3
 teaser: 'Pipeline component for rule-based token attribute assignment'
 api_string_name: attribute_ruler
 api_trainable: false
@@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the
 | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
+%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py
 ```
 
 ## AttributeRuler.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 27624398ec6..a35b6e2566c 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,8 +1,8 @@
 ---
 title: EntityRuler
 tag: class
-source: spacy/pipeline/entityruler.py
-version: 2.1
+source: spacy/pipeline/entity_ruler.py
+new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
@@ -65,7 +65,7 @@ how the component should be configured. You can override its settings via the
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/entityruler.py
+%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
 ```
 
 ## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 9a6791d5e0a..b44bd86ed06 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -189,7 +189,7 @@ the data to and from a JSON file.
 >
 > To see custom serialization methods in action, check out the new
 > [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the
+> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with a rule-based entity recognizer and including all

From ea8068541d3026dcec2fc7685c4fd847e74f5fd3 Mon Sep 17 00:00:00 2001
From: antonpibm <51074867+antonpibm@users.noreply.github.com>
Date: Thu, 11 Aug 2022 12:26:26 +0300
Subject: [PATCH 228/504] Match private networks as URLs (#11121)

---
 spacy/lang/tokenizer_exceptions.py | 4 ----
 spacy/tests/tokenizer/test_urls.py | 5 ++++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index dbf9aab4912..a612ae8ac7e 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -16,10 +16,6 @@
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
-    # private & local networks
-    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
-    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
-    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
     # IP address dotted notation octets
     # excludes loopback network 0.0.0.0
     # excludes reserved space >= 224.0.0.0
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index ff8812be183..4753462a506 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -32,6 +32,9 @@
     "http://userid:password@example.com/",
     "http://142.42.1.1/",
     "http://142.42.1.1:8080/",
+    "http://10.140.12.13/foo",
+    "http://10.140.12.13/foo/bar?arg1=baz&arg2=taz",
+    "http://10.1.1.1",
     "http://foo.com/blah_(wikipedia)#cite-1",
     "http://foo.com/blah_(wikipedia)_blah#cite-1",
     "http://foo.com/unicode_(✪)_in_parens",
@@ -93,6 +96,7 @@
     "http://foo.bar/foo(bar)baz quux",
     "http://-error-.invalid/",
     "http://a.b-.co",
+    # Loopback and broadcast addresses should be excluded
     "http://0.0.0.0",
     "http://10.1.1.0",
     "http://10.1.1.255",
@@ -101,7 +105,6 @@
     "http://3628126748",
     "http://.www.foo.bar/",
     "http://.www.foo.bar./",
-    "http://10.1.1.1",
     "NASDAQ:GOOG",
     "http://-a.b.co",
     pytest.param("foo.com", marks=pytest.mark.xfail()),

From 83570cb36e11f067e8307ad7e44b2d0baf4298fd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Aug 2022 12:13:54 +0200
Subject: [PATCH 229/504] Remove intify_attrs(_do_deprecated) (#11319)

---
 spacy/attrs.pyx                | 71 +---------------------------------
 spacy/tests/lang/test_attrs.py |  8 ----
 spacy/tokenizer.pyx            |  4 +-
 spacy/vocab.pyx                |  3 +-
 4 files changed, 4 insertions(+), 82 deletions(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 363dd094dcd..0a4aecc5d85 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -98,7 +98,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 locals().update(IDS)
 
 
-def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
+def intify_attrs(stringy_attrs, strings_map=None):
     """
     Normalize a dictionary of attributes, converting them to ints.
 
@@ -110,75 +110,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         converted to ints.
     """
     inty_attrs = {}
-    if _do_deprecated:
-        if "F" in stringy_attrs:
-            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if "L" in stringy_attrs:
-            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if "pos" in stringy_attrs:
-            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")  # no-cython-lint
-        if "number" in stringy_attrs:
-            stringy_attrs.pop("number")
-        if "tenspect" in stringy_attrs:
-            stringy_attrs.pop("tenspect")
-        morph_keys = [
-            "PunctType",
-            "PunctSide",
-            "Other",
-            "Degree",
-            "AdvType",
-            "Number",
-            "VerbForm",
-            "PronType",
-            "Aspect",
-            "Tense",
-            "PartType",
-            "Poss",
-            "Hyph",
-            "ConjType",
-            "NumType",
-            "Foreign",
-            "VerbType",
-            "NounType",
-            "Gender",
-            "Mood",
-            "Negative",
-            "Tense",
-            "Voice",
-            "Abbr",
-            "Derivation",
-            "Echo",
-            "Foreign",
-            "NameType",
-            "NounType",
-            "NumForm",
-            "NumValue",
-            "PartType",
-            "Polite",
-            "StyleVariant",
-            "PronType",
-            "AdjType",
-            "Person",
-            "Variant",
-            "AdpType",
-            "Reflex",
-            "Negative",
-            "Mood",
-            "Aspect",
-            "Case",
-            "Polarity",
-            "PrepCase",
-            "Animacy",  # U20
-        ]
-        for key in morph_keys:
-            if key in stringy_attrs:
-                stringy_attrs.pop(key)
-            elif key.lower() in stringy_attrs:
-                stringy_attrs.pop(key.lower())
-            elif key.upper() in stringy_attrs:
-                stringy_attrs.pop(key.upper())
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index fd96e8f9bd4..0f52c3ed511 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -31,14 +31,6 @@ def test_attrs_idempotence(text):
     assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
 
 
-@pytest.mark.parametrize("text", ["dog"])
-def test_attrs_do_deprecated(text):
-    int_attrs = intify_attrs(
-        {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
-    )
-    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
-
-
 def test_attrs_ent_iob_intify():
     int_attrs = intify_attrs({"ENT_IOB": ""})
     assert int_attrs == {ENT_IOB: 0}
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 96545828fde..1ba9381fe46 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -586,7 +586,7 @@ cdef class Tokenizer:
         substrings (iterable): A sequence of dicts, where each dict describes
             a token and its attributes.
         """
-        attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
+        attrs = [intify_attrs(spec) for spec in substrings]
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
@@ -654,7 +654,7 @@ cdef class Tokenizer:
             url_match = re.compile("a^").match
         special_cases = {}
         for orth, special_tokens in self.rules.items():
-            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
+            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens]
         tokens = []
         for substring in text.split():
             suffixes = []
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 19e6eb005c0..4ecefd8b9dc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -274,8 +274,7 @@ cdef class Vocab:
         cdef int i
         tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
         for i, props in enumerate(substrings):
-            props = intify_attrs(props, strings_map=self.strings,
-                                 _do_deprecated=True)
+            props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
             lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])

From 49d3da633615edfc6a0e7d4397e394482ac3bc31 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 22 Aug 2022 15:52:24 +0200
Subject: [PATCH 230/504] Cleanup Cython structs (#11337)

* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumpsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
---
 spacy/tokenizer.pxd |  6 +-----
 spacy/tokenizer.pyx | 17 +++++++++++++++++
 spacy/vocab.pxd     |  1 -
 spacy/vocab.pyi     |  1 -
 spacy/vocab.pyx     |  7 ++-----
 5 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index a902ebad941..f64e0e93413 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,11 +23,7 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO convert to bool in v4
-    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef bint _faster_heuristics
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 1ba9381fe46..407ca6ca6de 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -8,11 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
+
+from .tokens.doc cimport Doc
+from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
+from .attrs import intify_attrs
+from .symbols import ORTH, NORM
+from .errors import Errors
 from . import util
+from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
@@ -128,6 +135,7 @@ cdef class Tokenizer:
         self._specials = PreshMap()
         self._load_special_cases(rules)
 
+<<<<<<< HEAD
     @property
     def faster_heuristics(self):
         return bool(self._faster_heuristics)
@@ -136,6 +144,15 @@ cdef class Tokenizer:
     def faster_heuristics(self, faster_heuristics):
         self._faster_heuristics = bool(faster_heuristics)
         self._reload_special_cases()
+=======
+    property faster_heuristics:
+        def __get__(self):
+            return self._faster_heuristics
+
+        def __set__(self, faster_heuristics):
+            self._faster_heuristics = faster_heuristics
+            self._reload_special_cases()
+>>>>>>> 5abfa8215 (Cleanup Cython structs (#11337))
 
     def __reduce__(self):
         args = (self.vocab,
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 43e47af1dee..b91ce3ab45b 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -32,7 +32,6 @@ cdef class Vocab:
     cdef public object writing_system
     cdef public object get_noun_chunks
     cdef readonly int length
-    cdef public object _unused_object  # TODO remove in v4, see #9150
     cdef public object lex_attr_getters
     cdef public object cfg
 
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index b7ff20348a0..7f5f23e7847 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -73,7 +73,6 @@ def unpickle_vocab(
     sstore: StringStore,
     vectors: Any,
     morphology: Any,
-    _unused_object: Any,
     lex_attr_getters: Any,
     lookups: Any,
     get_noun_chunks: Any,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4ecefd8b9dc..0a8b390ffa9 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -581,21 +581,18 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     vectors = vocab.vectors
     morph = vocab.morphology
-    _unused_object = vocab._unused_object
     lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
     lookups = vocab.lookups
     get_noun_chunks = vocab.get_noun_chunks
     return (unpickle_vocab,
-            (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
+            (sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))
 
 
-def unpickle_vocab(sstore, vectors, morphology, _unused_object,
-                   lex_attr_getters, lookups, get_noun_chunks):
+def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
     cdef Vocab vocab = Vocab()
     vocab.vectors = vectors
     vocab.strings = sstore
     vocab.morphology = morphology
-    vocab._unused_object = _unused_object
     vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
     vocab.lookups = lookups
     vocab.get_noun_chunks = get_noun_chunks

From 51646726b798663a04297c2f0aa7d7173a269c0c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 22 Aug 2022 20:28:57 +0200
Subject: [PATCH 231/504] Make Span/Doc.ents more consistent for ent_kb_id and
 ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
---
 spacy/tests/doc/test_add_entities.py       |  27 ++++
 spacy/tests/doc/test_span.py               |  56 +++-----
 spacy/tokens/doc.pyx                       |  12 +-
 spacy/tokens/span.pyi                      |  24 ++--
 spacy/tokens/span.pyx                      |  66 +++++-----
 website/docs/api/span.mdx                  |  46 +++----
 website/docs/api/token.mdx                 | 144 ++++++++++-----------
 website/docs/usage/rule-based-matching.mdx |   6 +-
 8 files changed, 200 insertions(+), 181 deletions(-)

diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 259b21fb3dd..586b8a745f6 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -46,6 +46,33 @@ def test_ents_reset(en_vocab):
     assert [t.ent_iob_ for t in doc] == orig_iobs
 
 
+def test_ents_clear(en_vocab):
+    """Ensure that removing entities clears token attributes"""
+    text = ["Louisiana", "Office", "of", "Conservation"]
+    doc = Doc(en_vocab, words=text)
+    entity = Span(doc, 0, 4, label=391, span_id="TEST")
+    doc.ents = [entity]
+    doc.ents = []
+    for token in doc:
+        assert token.ent_iob == 2
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.ents = [entity]
+    doc.set_ents([], default="missing")
+    for token in doc:
+        assert token.ent_iob == 0
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+    doc.set_ents([], default="blocked")
+    for token in doc:
+        assert token.ent_iob == 3
+        assert token.ent_type == 0
+        assert token.ent_id == 0
+        assert token.ent_kb_id == 0
+
+
 def test_add_overlapping_entities(en_vocab):
     text = ["Louisiana", "Office", "of", "Conservation"]
     doc = Doc(en_vocab, words=text)
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index e5c71dafcf7..ab8538b17dc 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -703,41 +703,21 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2
 
 
-def test_for_partial_ent_sents():
-    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
-    which this tests for.
-    """
-    doc = Doc(
-        English().vocab,
-        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
-        sent_starts=[1, 0, 0, 1, 0, 0],
-    )
-    doc.set_ents([Span(doc, 1, 4, "WORK")])
-    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
-    # equal to the sentences referenced in ent.sents.
-    for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
-        assert doc_sent == ent_sent
-
-
-def test_for_no_ent_sents():
-    """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
-    sentence.
-    """
-    doc = Doc(
-        English().vocab,
-        words=["This", "is", "a", "test.", "ENTITY"],
-        sent_starts=[1, 0, 0, 0, 1],
-    )
-    doc.set_ents([Span(doc, 4, 5, "WORK")])
-    sents = list(doc.ents[0].sents)
-    assert len(sents) == 1
-    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
-
-
-def test_span_api_richcmp_other(en_tokenizer):
-    doc1 = en_tokenizer("a b")
-    doc2 = en_tokenizer("b c")
-    assert not doc1[1:2] == doc1[1]
-    assert not doc1[1:2] == doc2[0]
-    assert not doc1[1:2] == doc2[0:1]
-    assert not doc1[0:1] == doc2
+@pytest.mark.issue(11113)
+def test_span_ent_id(en_tokenizer):
+    doc = en_tokenizer("a b c d")
+    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
+    span = doc.ents[0]
+    assert doc[1].ent_id_ == "ID0"
+
+    # setting Span.id sets Token.ent_id
+    span.id_ = "ID1"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID1"
+    assert doc[1].ent_id_ == "ID1"
+
+    # Span.ent_id is an alias of Span.id
+    span.ent_id_ = "ID2"
+    doc.ents = [span]
+    assert doc.ents[0].ent_id_ == "ID2"
+    assert doc[1].ent_id_ == "ID2"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4d624956968..7384e44cd20 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -844,27 +844,33 @@ cdef class Doc:
                     self.c[i].ent_iob = 1
                 self.c[i].ent_type = span.label
                 self.c[i].ent_kb_id = span.kb_id
-                # for backwards compatibility in v3, only set ent_id from
-                # span.id if it's set, otherwise don't override
-                self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
+                self.c[i].ent_id = span.id
         for span in blocked:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 3
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
         for span in missing:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 0
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
         for span in outside:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 2
                 self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+                self.c[i].ent_id = 0
 
         # Set tokens outside of all provided spans
         if default != SetEntsDefault.unmodified:
             for i in range(self.length):
                 if i not in seen_tokens:
                     self.c[i].ent_type = 0
+                    self.c[i].ent_kb_id = 0
+                    self.c[i].ent_id = 0
                     if default == SetEntsDefault.outside:
                         self.c[i].ent_iob = 2
                     elif default == SetEntsDefault.missing:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index b982eb810b8..a6731d1c2d4 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -125,15 +125,23 @@ class Span:
     end: int
     start_char: int
     end_char: int
-    label: int
-    kb_id: int
-    id: int
-    ent_id: int
-    ent_id_: str
+    @property
+    def label(self) -> int: ...
+    @property
+    def kb_id(self) -> int: ...
+    @property
+    def id(self) -> int: ...
+    @property
+    def ent_id(self) -> int: ...
     @property
     def orth_(self) -> str: ...
     @property
     def lemma_(self) -> str: ...
-    label_: str
-    kb_id_: str
-    id_: str
+    @property
+    def label_(self) -> str: ...
+    @property
+    def kb_id_(self) -> str: ...
+    @property
+    def id_(self) -> str: ...
+    @property
+    def ent_id_(self) -> str: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 5afe2ffbc8e..191f3783e14 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -834,31 +834,20 @@ cdef class Span:
         def __set__(self, attr_t kb_id):
             self.span_c().kb_id = kb_id
 
-    @property
-    def id(self):
-        return self.c.id
-
-    @id.setter
-    def id(self, attr_t id):
-        self.c.id = id
-
-    @property
-    def ent_id(self):
-        """RETURNS (uint64): The entity ID."""
-        return self.root.ent_id
+    property id:
+        def __get__(self):
+            return self.span_c().id
 
-    @ent_id.setter
-    def ent_id(self, hash_t key):
-        raise NotImplementedError(Errors.E200.format(attr="ent_id"))
+        def __set__(self, attr_t id):
+            self.span_c().id = id
 
-    @property
-    def ent_id_(self):
-        """RETURNS (str): The (string) entity ID."""
-        return self.root.ent_id_
+    property ent_id:
+        """Alias for the span's ID."""
+        def __get__(self):
+            return self.id
 
-    @ent_id_.setter
-    def ent_id_(self, str key):
-        raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
+        def __set__(self, attr_t ent_id):
+            self.id = ent_id
 
     @property
     def orth_(self):
@@ -873,33 +862,42 @@ cdef class Span:
         """RETURNS (str): The span's lemma."""
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
-    @property
-    def label_(self):
-        """RETURNS (str): The span's label."""
-        return self.doc.vocab.strings[self.label]
+    property label_:
+        """The span's label."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.label]
 
     @label_.setter
     def label_(self, str label_):
         self.label = self.doc.vocab.strings.add(label_)
 
-    @property
-    def kb_id_(self):
-        """RETURNS (str): The span's KB ID."""
-        return self.doc.vocab.strings[self.kb_id]
+    property kb_id_:
+        """The span's KB ID."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.kb_id]
 
     @kb_id_.setter
     def kb_id_(self, str kb_id_):
         self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
-    @property
-    def id_(self):
-        """RETURNS (str): The span's ID."""
-        return self.doc.vocab.strings[self.id]
+    property id_:
+        """The span's ID."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.id]
 
     @id_.setter
     def id_(self, str id_):
         self.id = self.doc.vocab.strings.add(id_)
 
+    property ent_id_:
+        """Alias for the span's ID."""
+        def __get__(self):
+            return self.id_
+
+        def __set__(self, str ent_id_):
+            self.id_ = ent_id_
+
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 41422a5b4e1..5e7495f17ca 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -547,26 +547,26 @@ overlaps with will be returned.
 
 ## Attributes {id="attributes"}
 
-| Name           | Description                                                                                                                   |
-| -------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `doc`          | The parent document. ~~Doc~~                                                                                                  |
-| `tensor`       | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
-| `start`        | The token offset for the start of the span. ~~int~~                                                                           |
-| `end`          | The token offset for the end of the span. ~~int~~                                                                             |
-| `start_char`   | The character offset for the start of the span. ~~int~~                                                                       |
-| `end_char`     | The character offset for the end of the span. ~~int~~                                                                         |
-| `text`         | A string representation of the span text. ~~str~~                                                                             |
-| `text_with_ws` | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~                          |
-| `orth`         | ID of the verbatim text content. ~~int~~                                                                                      |
-| `orth_`        | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~            |
-| `label`        | The hash value of the span's label. ~~int~~                                                                                   |
-| `label_`       | The span's label. ~~str~~                                                                                                     |
-| `lemma_`       | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~                                      |
-| `kb_id`        | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
-| `kb_id_`       | The knowledge base ID referred to by the span. ~~str~~                                                                        |
-| `ent_id`       | The hash value of the named entity the root token is an instance of. ~~int~~                                                  |
-| `ent_id_`      | The string ID of the named entity the root token is an instance of. ~~str~~                                                   |
-| `id`           | The hash value of the span's ID. ~~int~~                                                                                      |
-| `id_`          | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`    | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
-| `_`            | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
+| Name                                    | Description                                                                                                                   |
+| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                   | The parent document. ~~Doc~~                                                                                                  |
+| `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                              |
+| `start`                                 | The token offset for the start of the span. ~~int~~                                                                           |
+| `end`                                   | The token offset for the end of the span. ~~int~~                                                                             |
+| `start_char`                            | The character offset for the start of the span. ~~int~~                                                                       |
+| `end_char`                              | The character offset for the end of the span. ~~int~~                                                                         |
+| `text`                                  | A string representation of the span text. ~~str~~                                                                             |
+| `text_with_ws`                          | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~                          |
+| `orth`                                  | ID of the verbatim text content. ~~int~~                                                                                      |
+| `orth_`                                 | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~            |
+| `label`                                 | The hash value of the span's label. ~~int~~                                                                                   |
+| `label_`                                | The span's label. ~~str~~                                                                                                     |
+| `lemma_`                                | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~                                      |
+| `kb_id`                                 | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
+| `kb_id_`                                | The knowledge base ID referred to by the span. ~~str~~                                                                        |
+| `ent_id`                                | Alias for `id`: the hash value of the span's ID. ~~int~~                                                                      |
+| `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
+| `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
+| `id_`                                   | The span's ID. ~~str~~                                                                                                        |
+| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
+| `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 63ee1080bf1..12b99394350 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -403,75 +403,75 @@ The L2 norm of the token's vector representation.
 
 ## Attributes {id="attributes"}
 
-| Name                               | Description                                                                                                                                                                                                                                                          |
-| ---------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc`                              | The parent document. ~~Doc~~                                                                                                                                                                                                                                         |
-| `lex` <Tag variant="new">3</Tag>   | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                    |
-| `sent`                             | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                             |
-| `text`                             | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
-| `text_with_ws`                     | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                      |
-| `whitespace_`                      | Trailing space character if present. ~~str~~                                                                                                                                                                                                                         |
-| `orth`                             | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
-| `orth_`                            | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                  |
-| `vocab`                            | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                      |
-| `tensor`                           | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
-| `head`                             | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                        |
-| `left_edge`                        | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
-| `right_edge`                       | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                 |
-| `i`                                | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                           |
-| `ent_type`                         | Named entity type. ~~int~~                                                                                                                                                                                                                                           |
-| `ent_type_`                        | Named entity type. ~~str~~                                                                                                                                                                                                                                           |
-| `ent_iob`                          | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                 |
-| `ent_iob_`                         | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
-| `ent_kb_id`                        | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
-| `ent_kb_id_`                       | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
-| `ent_id`                           | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~                                                                                                                                        |
-| `ent_id_`                          | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                        |
-| `lemma`                            | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
-| `lemma_`                           | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
-| `norm`                             | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
-| `norm_`                            | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                   |
-| `lower`                            | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                 |
-| `lower_`                           | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                        |
-| `shape`                            | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_`                           | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
-| `prefix`                           | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                           |
-| `prefix_`                          | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                         |
-| `suffix`                           | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                             |
-| `suffix_`                          | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                             |
-| `is_alpha`                         | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                      |
-| `is_ascii`                         | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                          |
-| `is_digit`                         | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                     |
-| `is_lower`                         | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                            |
-| `is_upper`                         | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                            |
-| `is_title`                         | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                            |
-| `is_punct`                         | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
-| `is_left_punct`                    | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
-| `is_right_punct`                   | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
-| `is_sent_start`                    | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
-| `is_sent_end`                      | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
-| `is_space`                         | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
-| `is_bracket`                       | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
-| `is_quote`                         | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |
-| `is_currency`                      | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                             |
-| `like_url`                         | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                              |
-| `like_num`                         | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                           |
-| `like_email`                       | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
-| `is_oov`                           | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
-| `is_stop`                          | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
-| `pos`                              | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
-| `pos_`                             | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
-| `tag`                              | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
-| `tag_`                             | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
-| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
-| `dep`                              | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                               |
-| `dep_`                             | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                               |
-| `lang`                             | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                |
-| `lang_`                            | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
-| `prob`                             | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
-| `idx`                              | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                        | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
-| `lex_id`                           | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
-| `rank`                             | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
-| `cluster`                          | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
-| `_`                                | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                        |
+| Name                                         | Description                                                                                                                                                                                                                                                          |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                        | The parent document. ~~Doc~~                                                                                                                                                                                                                                         |
+| `lex` <Tag variant="new">3</Tag>             | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                    |
+| `sent` <Tag variant="new">2.0.12</Tag>       | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                             |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
+| `text_with_ws`                               | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                      |
+| `whitespace_`                                | Trailing space character if present. ~~str~~                                                                                                                                                                                                                         |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
+| `orth_`                                      | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                  |
+| `vocab`                                      | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                      |
+| `tensor` <Tag variant="new">2.1.7</Tag>      | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
+| `head`                                       | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                        |
+| `left_edge`                                  | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
+| `right_edge`                                 | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                 |
+| `i`                                          | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                           |
+| `ent_type`                                   | Named entity type. ~~int~~                                                                                                                                                                                                                                           |
+| `ent_type_`                                  | Named entity type. ~~str~~                                                                                                                                                                                                                                           |
+| `ent_iob`                                    | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                 |
+| `ent_iob_`                                   | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
+| `ent_kb_id` <Tag variant="new">2.2</Tag>     | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
+| `ent_kb_id_` <Tag variant="new">2.2</Tag>    | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
+| `ent_id`                                     | ID of the entity the token is an instance of, if any. ~~int~~                                                                                                                                                                                                        |
+| `ent_id_`                                    | ID of the entity the token is an instance of, if any. ~~str~~                                                                                                                                                                                                        |
+| `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
+| `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
+| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
+| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                   |
+| `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                 |
+| `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                        |
+| `shape`                                      | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `prefix`                                     | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                           |
+| `prefix_`                                    | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                         |
+| `suffix`                                     | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                             |
+| `suffix_`                                    | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                             |
+| `is_alpha`                                   | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                      |
+| `is_ascii`                                   | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                          |
+| `is_digit`                                   | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                     |
+| `is_lower`                                   | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                            |
+| `is_upper`                                   | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                            |
+| `is_title`                                   | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                            |
+| `is_punct`                                   | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
+| `is_left_punct`                              | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
+| `is_right_punct`                             | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
+| `is_sent_start`                              | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
+| `is_sent_end`                                | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
+| `is_space`                                   | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
+| `is_bracket`                                 | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
+| `is_quote`                                   | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                             |
+| `like_url`                                   | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                              |
+| `like_num`                                   | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                           |
+| `like_email`                                 | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
+| `is_oov`                                     | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
+| `is_stop`                                    | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
+| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
+| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
+| `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
+| `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
+| `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
+| `dep`                                        | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                               |
+| `dep_`                                       | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                               |
+| `lang`                                       | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                |
+| `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
+| `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
+| `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
+| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
+| `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
+| `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
+| `_`                                          | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                        |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index e5b98da3a8c..c90172b4325 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1399,14 +1399,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
 ruler.add_patterns(patterns)
 
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
 
 doc2 = nlp("Apple is opening its first big office in San Fran.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
 If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `ent_id_` property of the matched entity is set to the `id` given
+patterns, the `id_` property of the matched entity is set to the `id` given
 in the patterns. So in the example above it's easy to identify that "San
 Francisco" and "San Fran" are both the same entity.
 

From 926d00b39627275a239316c886f50211c2606349 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 26 Aug 2022 10:11:18 +0200
Subject: [PATCH 232/504] Switch to mecab-ko as default Korean tokenizer
 (#11294)

* Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.

* Temporarily run tests with mecab-ko tokenizer

* Fix types

* Fix duplicate test names

* Update requirements test

* Revert "Temporarily run tests with mecab-ko tokenizer"

This reverts commit d2083e7044403a2046f902b125a147525b703e29.

* Add mecab_args setting, fix pickle for KoreanNattoTokenizer

* Fix length check

* Update docs

* Formatting

* Update natto-py error message

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 setup.cfg                                 |   2 +-
 spacy/lang/ko/__init__.py                 | 121 +++++++++++++++++-----
 spacy/tests/conftest.py                   |  16 ++-
 spacy/tests/lang/ko/test_lemmatization.py |   8 ++
 spacy/tests/lang/ko/test_serialize.py     |  20 ++++
 spacy/tests/lang/ko/test_tokenizer.py     |  42 +++++++-
 spacy/tests/package/test_requirements.py  |   2 +-
 website/docs/usage/models.mdx             |  35 ++++++-
 8 files changed, 212 insertions(+), 34 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index a6b14eb0676..e27f9adeacc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -122,7 +122,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
 
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index e2c860f7de9..81052cb24aa 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -17,34 +17,23 @@
 
 [nlp.tokenizer]
 @tokenizers = "spacy.ko.KoreanTokenizer"
+mecab_args = ""
 """
 
 
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
-def create_tokenizer():
+def create_tokenizer(mecab_args: str):
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp.vocab)
+        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
 
     return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, vocab: Vocab):
+    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
-        self._mecab_tokenizer = None
-
-    @property
-    def mecab_tokenizer(self):
-        # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
-        # `spacy init vectors ko` for a pipeline that will have a different
-        # tokenizer in the end. The languages need to match for the vectors
-        # to be imported and there's no way to pass a custom config to
-        # `init vectors`.
-        if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
-        return self._mecab_tokenizer
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger(mecab_args)
 
     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)
@@ -67,13 +56,15 @@ def __call__(self, text: str) -> Doc:
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1],	종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5],	마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
                 break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
             if lemma == "*":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -96,20 +87,94 @@ class Korean(Language):
     Defaults = KoreanDefaults
 
 
-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        import mecab_ko as MeCab
 
         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "the python package `mecab-ko`: pip install mecab-ko"
         ) from None
 
 
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer
+
+    def __reduce__(self):
+        return KoreanNattoTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1],      종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5],     마지막 품사(end pos)[6], 표현(expression)[7], *
+        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
+            if node.is_eos():
+                break
+            surface = node.surface
+            feature = node.feature
+            tag, _, expr = feature.partition(",")
+            lemma, _, remainder = expr.partition("/")
+            if lemma == "*" or lemma == "":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 7db986ab9e7..2a9f441c9b0 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -245,7 +245,7 @@ def hsb_tokenizer():
 
 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer
 
 
@@ -267,6 +267,20 @@ def la_tokenizer():
     return get_lang_class("la")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py
index 7782ca4bcab..0c389b9ce52 100644
--- a/spacy/tests/lang/ko/test_lemmatization.py
+++ b/spacy/tests/lang/ko/test_lemmatization.py
@@ -7,3 +7,11 @@
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
index bba7bce6e05..eecc7d955ba 100644
--- a/spacy/tests/lang/ko/test_serialize.py
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -23,3 +23,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index 6e06e405e0b..e7f8a5c0d79 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -19,6 +19,8 @@
               "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on
 
+# tests for ko_tokenizer (default KoreanTokenizer)
+
 
 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     assert pos == expected_pos.split()
 
 
-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
 
@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"
 
 
+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index ff07c5b454a..704d4b90b44 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -25,7 +25,7 @@ def test_build_dependencies():
     libs_ignore_setup = [
         "numpy",
         "fugashi",
-        "natto-py",
+        "mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 7fed9f40765..9213dead16b 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -264,18 +264,49 @@ used for training the current [Japanese pipelines](/models/ja).
 
 ### Korean language support {id="korean"}
 
-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
+> # uses mecab-ko-dic
 > nlp = spacy.blank("ko")
+>
+> # with custom mecab args
+> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
+> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
+> nlp = spacy.blank("ko", config=config)
 > ```
 
-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
+requirements.
+
+The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
+earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
 
 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)
 
+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer:

From 3b48b37c67b8ae5863da81a74146b8101923d207 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Aug 2022 13:23:24 +0200
Subject: [PATCH 233/504] Remove setup_requires from setup.cfg (#11384)

* Remove setup_requires from setup.cfg

* Update requirements test to ignore cython in setup.cfg
---
 setup.cfg                                | 13 +------------
 spacy/tests/package/test_requirements.py |  2 +-
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index e27f9adeacc..a5df23cc4ea 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,18 +30,7 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.7
-# NOTE: This section is superseded by pyproject.toml and will be removed in
-# spaCy v4
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0; python_version < "3.9"
-    numpy>=1.19.0; python_version >= "3.9"
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
+python_requires = >=3.6
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 704d4b90b44..a63b1d8b060 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -5,7 +5,7 @@
 def test_build_dependencies():
     # Check that library requirements are pinned exactly the same across different setup files.
     libs_ignore_requirements = [
-        "numpy",
+        "cython",
         "pytest",
         "pytest-timeout",
         "mock",

From 9d3ce438d80b9fd971fb31f60b9c7547c050e397 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Aug 2022 13:56:35 +0200
Subject: [PATCH 234/504] Make stable private modules public and adjust names
 (#11353)

* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
---
 setup.py                                           |  2 +-
 .../ml/{_character_embed.py => character_embed.py} |  0
 spacy/ml/models/tok2vec.py                         |  6 ++++--
 spacy/pipeline/attribute_ruler.py                  |  4 ++--
 spacy/tests/pipeline/test_models.py                |  2 +-
 spacy/tests/pipeline/test_spancat.py               |  2 +-
 .../tests/serialize/test_serialize_span_groups.py  |  2 +-
 spacy/tokens/__init__.py                           |  3 ++-
 spacy/tokens/doc.pyi                               |  5 ++++-
 spacy/tokens/doc.pyx                               | 14 ++++++++++++++
 spacy/tokens/{_serialize.py => doc_bin.py}         | 11 ++++++-----
 spacy/tokens/{_retokenize.pyi => retokenizer.pyi}  |  0
 spacy/tokens/{_retokenize.pyx => retokenizer.pyx}  |  0
 spacy/tokens/{_dict_proxies.py => span_groups.py}  |  0
 14 files changed, 36 insertions(+), 15 deletions(-)
 rename spacy/ml/{_character_embed.py => character_embed.py} (100%)
 rename spacy/tokens/{_serialize.py => doc_bin.py} (97%)
 rename spacy/tokens/{_retokenize.pyi => retokenizer.pyi} (100%)
 rename spacy/tokens/{_retokenize.pyx => retokenizer.pyx} (100%)
 rename spacy/tokens/{_dict_proxies.py => span_groups.py} (100%)

diff --git a/setup.py b/setup.py
index 33178662df4..c9b4f7171e3 100755
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@
     "spacy.tokens.span_group",
     "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
-    "spacy.tokens._retokenize",
+    "spacy.tokens.retokenizer",
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/character_embed.py
similarity index 100%
rename from spacy/ml/_character_embed.py
rename to spacy/ml/character_embed.py
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 0edc8999114..a605d32cd40 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -21,7 +21,9 @@
 
 from ...attrs import intify_attr
 from ...errors import Errors
-from ...ml import _character_embed
+from ...ml import character_embed
+from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
@@ -241,7 +243,7 @@ def CharacterEmbed(
     if feature is None:
         raise ValueError(Errors.E911.format(feat=feature))
     char_embed = chain(
-        _character_embed.CharacterEmbed(nM=nM, nC=nC),
+        character_embed.CharacterEmbed(nM=nM, nC=nC),
         cast(Model[List[Floats2d], Ragged], list2ragged()),
     )
     feature_extractor: Model[List[Doc], Ragged] = chain(
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 8ac74d92bcd..126a48945bc 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -10,8 +10,8 @@
 from ..scorer import Scorer
 from ..symbols import IDS
 from ..tokens import Doc, Span
-from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
-from ..training import Example
+from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
+from ..vocab import Vocab
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py
index fef0017a8e1..4c0d352aa7f 100644
--- a/spacy/tests/pipeline/test_models.py
+++ b/spacy/tests/pipeline/test_models.py
@@ -8,7 +8,7 @@
 
 from spacy.lang.en import English
 from spacy.ml import FeatureExtractor, StaticVectors
-from spacy.ml._character_embed import CharacterEmbed
+from spacy.ml.character_embed import CharacterEmbed
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 9405a78e040..c143d193fa6 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -7,7 +7,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tokens import SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
 from spacy.util import fix_random_seed, make_tempdir, registry
 
diff --git a/spacy/tests/serialize/test_serialize_span_groups.py b/spacy/tests/serialize/test_serialize_span_groups.py
index 85313fcdcc3..c1c910fa137 100644
--- a/spacy/tests/serialize/test_serialize_span_groups.py
+++ b/spacy/tests/serialize/test_serialize_span_groups.py
@@ -1,7 +1,7 @@
 import pytest
 
 from spacy.tokens import Span, SpanGroup
-from spacy.tokens._dict_proxies import SpanGroups
+from spacy.tokens.span_groups import SpanGroups
 
 
 @pytest.mark.issue(10685)
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 3393ca6eca9..e5a244360e3 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -3,6 +3,7 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .token import Token
+from .doc_bin import DocBin
+from .morphanalysis import MorphAnalysis
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f0b68862c32..0fae118b4b6 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -17,7 +17,10 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import Floats1d, Floats2d, Ints2d
-
+from .span import Span
+from .token import Token
+from .span_groups import SpanGroups
+from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
 from ..vocab import Vocab
 from ._dict_proxies import SpanGroups
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7384e44cd20..f9a706f4a98 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,6 +20,13 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
+from .span_groups import SpanGroups
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..typedefs cimport attr_t, flags_t
+from ..attrs cimport attr_id_t
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
 from ._dict_proxies import SpanGroups
 
@@ -50,6 +57,13 @@ from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors, Warnings
+from ..morphology import Morphology
+from .. import util
+from .. import parts_of_speech
+from .. import schemas
+from .underscore import Underscore, get_ext_args
+from .retokenizer import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from ._retokenize import Retokenizer
 from .underscore import Underscore, get_ext_args
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/doc_bin.py
similarity index 97%
rename from spacy/tokens/_serialize.py
rename to spacy/tokens/doc_bin.py
index 873d85835f0..8a08864d46e 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,11 +10,12 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import SimpleFrozenList, ensure_path
-from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS
-from .doc import Doc
+from ..util import ensure_path, SimpleFrozenList
+from .span_groups import SpanGroups
+
+# fmt: off
+ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
+# fmt: on
 
 
 class DocBin:
diff --git a/spacy/tokens/_retokenize.pyi b/spacy/tokens/retokenizer.pyi
similarity index 100%
rename from spacy/tokens/_retokenize.pyi
rename to spacy/tokens/retokenizer.pyi
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/retokenizer.pyx
similarity index 100%
rename from spacy/tokens/_retokenize.pyx
rename to spacy/tokens/retokenizer.pyx
diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/span_groups.py
similarity index 100%
rename from spacy/tokens/_dict_proxies.py
rename to spacy/tokens/span_groups.py

From c6f7a88539d00820ca59fb1ed3d813324b8308ae Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 30 Aug 2022 22:40:31 +0900
Subject: [PATCH 235/504] Update/remove old Matcher syntax (#11370)

* Clean up old Matcher call style related stuff

In v2 Matcher.add was called with (key, on_match, *patterns). In v3 this
was changed to (key, patterns, *, on_match=None), but there were various
points where the old call syntax was documented or handled specially.
This removes all those.

The Matcher itself didn't need any code changes, as it just gives a
generic type error. However the PhraseMatcher required some changes
because it would automatically "fix" the old call style.

Surprisingly, the tokenizer was still using the old call style in one
place.

After these changes tests failed in two places:

1. one test for the "new" call style, including the "old" call style. I
   removed this test.
2. deserializing the PhraseMatcher fails because the input docs are a
   set.

I am not sure why 2 is happening - I guess it's a quirk of the
serialization format? - so for now I just convert the set to a list when
deserializing. The check that the input Docs are a List in the
PhraseMatcher is a new check, but makes it parallel with the other
Matchers, which seemed like the right thing to do.

* Add notes related to input docs / deserialization type

* Remove Typing import

* Remove old note about call style change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Use separate method for setting internal doc representations

In addition to the title change, this changes the internal dict to be a
defaultdict, instead of a dict with frequent use of setdefault.

* Add _add_from_arrays for unpickling

* Cleanup around adding from arrays

This moves adding to internal structures into the private batch method,
and removes the single-add method.

This has one behavioral change for `add`, in that if something is wrong
with the list of input Docs (such as one of the items not being a Doc),
valid items before the invalid one will not be added. Also the callback
will not be updated if anything is invalid. This change should not be
significant.

This also adds a test to check failure when given a non-Doc.

* Update spacy/matcher/phrasematcher.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                            |   7 +-
 spacy/matcher/dependencymatcher.pyx        |   6 +-
 spacy/matcher/matcher.pyx                  |   6 +-
 spacy/matcher/phrasematcher.pyi            |   9 ++
 spacy/matcher/phrasematcher.pyx            | 118 ++++++++++++---------
 spacy/tests/matcher/test_phrase_matcher.py |  29 ++---
 spacy/tokenizer.pyx                        |   2 +-
 website/docs/api/matcher.mdx               |  14 ---
 website/docs/api/phrasematcher.mdx         |  22 +---
 9 files changed, 97 insertions(+), 116 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index cf9a7b7087a..146c60b6d60 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -500,7 +500,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
-    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
+    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
             "callable or None, but got: {arg_type}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
@@ -759,7 +759,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "loaded nlp object, but got: {source}")
     E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
-    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
+    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
     E949 = ("Unable to align tokens for the predicted and reference docs. It "
             "is only possible to align the docs when both texts are the same "
@@ -989,6 +989,9 @@ class Errors(metaclass=ErrorsWithCodes):
              "reduction. Please enable one of `use_reduce_first`, "
              "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
+    # v4 error strings
+    E4000 = ("Expected a Doc as input, but got: '{type}'")
+
 
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index ab5f5d5d14b..0b639ab04fb 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -175,9 +175,9 @@ cdef class DependencyMatcher:
         on_match (callable): Optional callback executed on match.
         """
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index f0116169a6b..715dd45f07c 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -113,9 +113,9 @@ cdef class Matcher:
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
         if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
             raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
         for i, pattern in enumerate(patterns):
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 27f6ba373fc..f9585da7893 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -21,6 +21,15 @@ class PhraseMatcher:
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
         ] = ...,
     ) -> None: ...
+    def _add_from_arrays(
+        self,
+        key: str,
+        specs: List[List[int]],
+        *,
+        on_match: Optional[
+            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
+        ] = ...,
+    ) -> None: ...
     def remove(self, key: str) -> None: ...
     @overload
     def __call__(
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 4efcdb05c43..6e3c52924fa 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,5 +1,8 @@
-# cython: infer_types=True
-from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
+# cython: infer_types=True, profile=True
+from typing import List
+from collections import defaultdict
+from libc.stdint cimport uintptr_t
+from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 
 import warnings
 
@@ -39,7 +42,7 @@ cdef class PhraseMatcher:
         """
         self.vocab = vocab
         self._callbacks = {}
-        self._docs = {}
+        self._docs = defaultdict(set)
         self._validate = validate
 
         self.mem = Pool()
@@ -155,66 +158,24 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-    def add(self, key, docs, *_docs, on_match=None):
-        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
-        key, an on_match callback, and one or more patterns.
 
-        Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
-        second argument, with the on_match callback as an optional keyword
-        argument.
+    def _add_from_arrays(self, key, specs, *, on_match=None):
+        """Add a preprocessed list of specs, with an optional callback.
 
         key (str): The match ID.
-        docs (list): List of `Doc` objects representing match patterns.
+        specs (List[List[int]]): A list of lists of hashes to match.
         on_match (callable): Callback executed on match.
-        *_docs (Doc): For backwards compatibility: list of patterns to add
-            as variable arguments. Will be ignored if a list of patterns is
-            provided as the second argument.
-
-        DOCS: https://spacy.io/api/phrasematcher#add
         """
-        if docs is None or hasattr(docs, "__call__"):  # old API
-            on_match = docs
-            docs = _docs
-
-        _ = self.vocab[key]
-        self._callbacks[key] = on_match
-        self._docs.setdefault(key, set())
-
         cdef MapStruct* current_node
         cdef MapStruct* internal_node
         cdef void* result
 
-        if isinstance(docs, Doc):
-            raise ValueError(Errors.E179.format(key=key))
-        for doc in docs:
-            if len(doc) == 0:
-                continue
-            if isinstance(doc, Doc):
-                attrs = (TAG, POS, MORPH, LEMMA, DEP)
-                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                for attr in attrs:
-                    if self.attr == attr and not has_annotation[attr]:
-                        if attr == TAG:
-                            pipe = "tagger"
-                        elif attr in (POS, MORPH):
-                            pipe = "morphologizer or tagger+attribute_ruler"
-                        elif attr == LEMMA:
-                            pipe = "lemmatizer"
-                        elif attr == DEP:
-                            pipe = "parser"
-                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
-                        raise ValueError(error_msg)
-                if self._validate and any(has_annotation.values()) \
-                        and self.attr not in attrs:
-                    string_attr = self.vocab.strings[self.attr]
-                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
-                keyword = self._convert_to_array(doc)
-            else:
-                keyword = doc
-            self._docs[key].add(tuple(keyword))
+        self._callbacks[key] = on_match
+        for spec in specs:
+            self._docs[key].add(tuple(spec))
 
             current_node = self.c_map
-            for token in keyword:
+            for token in spec:
                 if token == self._terminal_hash:
                     warnings.warn(Warnings.W021)
                     break
@@ -233,6 +194,57 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
+
+    def add(self, key, docs, *, on_match=None):
+        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
+        key, a list of one or more patterns, and (optionally) an on_match callback.
+
+        key (str): The match ID.
+        docs (list): List of `Doc` objects representing match patterns.
+        on_match (callable): Callback executed on match.
+
+        If any of the input Docs are invalid, no internal state will be updated.
+
+        DOCS: https://spacy.io/api/phrasematcher#add
+        """
+        if isinstance(docs, Doc):
+            raise ValueError(Errors.E179.format(key=key))
+        if docs is None or not isinstance(docs, List):
+            raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
+        if on_match is not None and not hasattr(on_match, "__call__"):
+            raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))
+
+        _ = self.vocab[key]
+        specs = []
+
+        for doc in docs:
+            if len(doc) == 0:
+                continue
+            if not isinstance(doc, Doc):
+                raise ValueError(Errors.E4000.format(type=type(doc)))
+
+            attrs = (TAG, POS, MORPH, LEMMA, DEP)
+            has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+            for attr in attrs:
+                if self.attr == attr and not has_annotation[attr]:
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer or tagger+attribute_ruler"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
+            if self._validate and any(has_annotation.values()) \
+                    and self.attr not in attrs:
+                string_attr = self.vocab.strings[self.attr]
+                warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
+            specs.append(self._convert_to_array(doc))
+
+        self._add_from_arrays(key, specs, on_match=on_match)
+
     def __call__(self, object doclike, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.
 
@@ -345,7 +357,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     matcher = PhraseMatcher(vocab, attr=attr)
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
-        matcher.add(key, specs, on_match=callback)
+        matcher._add_from_arrays(key, specs, on_match=callback)
     return matcher
 
 
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 7335bbdf107..4ad234cba3b 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -198,28 +198,6 @@ def test_phrase_matcher_contains(en_vocab):
     assert "TEST2" not in matcher
 
 
-def test_phrase_matcher_add_new_api(en_vocab):
-    doc = Doc(en_vocab, words=["a", "b"])
-    patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("OLD_API", None, *patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("NEW_API", patterns)
-    assert len(matcher(doc)) == 2
-    matcher = PhraseMatcher(en_vocab)
-    on_match = Mock()
-    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
-    assert len(matcher(doc)) == 2
-    assert on_match.call_count == 2
-
-
 def test_phrase_matcher_repeated_add(en_vocab):
     matcher = PhraseMatcher(en_vocab)
     # match ID only gets added once
@@ -468,6 +446,13 @@ def test_phrase_matcher_deprecated(en_vocab):
         assert "spaCy v3.0" in str(record.list[0].message)
 
 
+def test_phrase_matcher_non_doc(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.raises(ValueError):
+        matcher.add("TEST", [doc, "junk"])
+
+
 @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
 def test_phrase_matcher_sent_start(en_vocab, attr):
     _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 407ca6ca6de..7c81d936314 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -636,7 +636,7 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._flush_cache()
         if not self.faster_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
-            self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
+            self._special_matcher.add(string, [self._tokenize_affixes(string, False)])
 
     def _reload_special_cases(self):
         self._flush_cache()
diff --git a/website/docs/api/matcher.mdx b/website/docs/api/matcher.mdx
index c66579da814..66954b6c4fb 100644
--- a/website/docs/api/matcher.mdx
+++ b/website/docs/api/matcher.mdx
@@ -211,20 +211,6 @@ will be overwritten.
 > matches = matcher(doc)
 > ```
 
-<Infobox title="Changed in v3.0" variant="warning">
-
-As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
-(instead of a variable number of arguments). The `on_match` callback becomes an
-optional keyword argument.
-
-```diff
-patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-- matcher.add("GoogleNow", on_match, *patterns)
-+ matcher.add("GoogleNow", patterns, on_match=on_match)
-```
-
-</Infobox>
-
 | Name                                | Description                                                                                                                                                |
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `match_id`                          | An ID for the thing you're matching. ~~str~~                                                                                                               |
diff --git a/website/docs/api/phrasematcher.mdx b/website/docs/api/phrasematcher.mdx
index 14ccefb772e..2c5e767dcba 100644
--- a/website/docs/api/phrasematcher.mdx
+++ b/website/docs/api/phrasematcher.mdx
@@ -116,10 +116,10 @@ Check whether the matcher contains rules for a match ID.
 ## PhraseMatcher.add {id="add",tag="method"}
 
 Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
 
 > #### Example
 >
@@ -134,20 +134,6 @@ overwritten.
 >   matches = matcher(doc)
 > ```
 
-<Infobox title="Changed in v3.0" variant="warning">
-
-As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
-argument (instead of a variable number of arguments). The `on_match` callback
-becomes an optional keyword argument.
-
-```diff
-patterns = [nlp("health care reform"), nlp("healthcare reform")]
-- matcher.add("HEALTH", on_match, *patterns)
-+ matcher.add("HEALTH", patterns, on_match=on_match)
-```
-
-</Infobox>
-
 | Name           | Description                                                                                                                                                |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `key`          | An ID for the thing you're matching. ~~str~~                                                                                                               |

From 401e78936d293c6b9d366acb0f5f79553edcf48a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Sep 2022 09:08:40 +0200
Subject: [PATCH 236/504] Consolidate and freeze symbols (#11352)

* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
---
 spacy/attrs.pxd             | 129 +++-------
 spacy/attrs.pyx             |  49 +---
 spacy/parts_of_speech.pxd   |  38 +--
 spacy/schemas.py            |   2 +-
 spacy/strings.pyx           |   4 +-
 spacy/symbols.pxd           |  15 +-
 spacy/symbols.pyx           |   6 +-
 spacy/tests/test_symbols.py | 467 ++++++++++++++++++++++++++++++++++++
 spacy/tokens/doc.pyx        |  20 +-
 9 files changed, 551 insertions(+), 179 deletions(-)
 create mode 100644 spacy/tests/test_symbols.py

diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index fbbac0ec29c..b8972cb714e 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,99 +1,50 @@
-# Reserve 64 values for flag features
 from . cimport symbols
 
 
 cdef enum attr_id_t:
-    NULL_ATTR
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-    LIKE_URL
-    LIKE_NUM
-    LIKE_EMAIL
-    IS_STOP
-    IS_OOV_DEPRECATED
-    IS_BRACKET
-    IS_QUOTE
-    IS_LEFT_PUNCT
-    IS_RIGHT_PUNCT
-    IS_CURRENCY
+    NULL_ATTR = 0
+    IS_ALPHA = symbols.IS_ALPHA
+    IS_ASCII = symbols.IS_ASCII
+    IS_DIGIT = symbols.IS_DIGIT
+    IS_LOWER = symbols.IS_LOWER
+    IS_PUNCT = symbols.IS_PUNCT
+    IS_SPACE = symbols.IS_SPACE
+    IS_TITLE = symbols.IS_TITLE
+    IS_UPPER = symbols.IS_UPPER
+    LIKE_URL = symbols.LIKE_URL
+    LIKE_NUM = symbols.LIKE_NUM
+    LIKE_EMAIL = symbols.LIKE_EMAIL
+    IS_STOP = symbols.IS_STOP
+    IS_BRACKET = symbols.IS_BRACKET
+    IS_QUOTE = symbols.IS_QUOTE
+    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
+    IS_CURRENCY = symbols.IS_CURRENCY
 
-    FLAG19 = 19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
+    ID = symbols.ID
+    ORTH = symbols.ORTH
+    LOWER = symbols.LOWER
+    NORM = symbols.NORM
+    SHAPE = symbols.SHAPE
+    PREFIX = symbols.PREFIX
+    SUFFIX = symbols.SUFFIX
 
-    ID
-    ORTH
-    LOWER
-    NORM
-    SHAPE
-    PREFIX
-    SUFFIX
+    LENGTH = symbols.LENGTH
+    CLUSTER = symbols.CLUSTER
+    LEMMA = symbols.LEMMA
+    POS = symbols.POS
+    TAG = symbols.TAG
+    DEP = symbols.DEP
+    ENT_IOB = symbols.ENT_IOB
+    ENT_TYPE = symbols.ENT_TYPE
+    HEAD = symbols.HEAD
+    SENT_START = symbols.SENT_START
+    SPACY = symbols.SPACY
+    PROB = symbols.PROB
 
-    LENGTH
-    CLUSTER
-    LEMMA
-    POS
-    TAG
-    DEP
-    ENT_IOB
-    ENT_TYPE
-    HEAD
-    SENT_START
-    SPACY
-    PROB
-
-    LANG
+    LANG = symbols.LANG
     ENT_KB_ID = symbols.ENT_KB_ID
-    MORPH
+    MORPH = symbols.MORPH
     ENT_ID = symbols.ENT_ID
 
-    IDX
-    SENT_END
+    IDX = symbols.IDX
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 0a4aecc5d85..1688afe47af 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -17,57 +17,11 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
     "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "IS_CURRENCY": IS_CURRENCY,
-    "FLAG19": FLAG19,
-    "FLAG20": FLAG20,
-    "FLAG21": FLAG21,
-    "FLAG22": FLAG22,
-    "FLAG23": FLAG23,
-    "FLAG24": FLAG24,
-    "FLAG25": FLAG25,
-    "FLAG26": FLAG26,
-    "FLAG27": FLAG27,
-    "FLAG28": FLAG28,
-    "FLAG29": FLAG29,
-    "FLAG30": FLAG30,
-    "FLAG31": FLAG31,
-    "FLAG32": FLAG32,
-    "FLAG33": FLAG33,
-    "FLAG34": FLAG34,
-    "FLAG35": FLAG35,
-    "FLAG36": FLAG36,
-    "FLAG37": FLAG37,
-    "FLAG38": FLAG38,
-    "FLAG39": FLAG39,
-    "FLAG40": FLAG40,
-    "FLAG41": FLAG41,
-    "FLAG42": FLAG42,
-    "FLAG43": FLAG43,
-    "FLAG44": FLAG44,
-    "FLAG45": FLAG45,
-    "FLAG46": FLAG46,
-    "FLAG47": FLAG47,
-    "FLAG48": FLAG48,
-    "FLAG49": FLAG49,
-    "FLAG50": FLAG50,
-    "FLAG51": FLAG51,
-    "FLAG52": FLAG52,
-    "FLAG53": FLAG53,
-    "FLAG54": FLAG54,
-    "FLAG55": FLAG55,
-    "FLAG56": FLAG56,
-    "FLAG57": FLAG57,
-    "FLAG58": FLAG58,
-    "FLAG59": FLAG59,
-    "FLAG60": FLAG60,
-    "FLAG61": FLAG61,
-    "FLAG62": FLAG62,
-    "FLAG63": FLAG63,
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,
@@ -93,8 +47,7 @@ IDS = {
 }
 
 
-# ATTR IDs, in order of the symbol
-NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+NAMES = {v: k for k, v in IDS.items()}
 locals().update(IDS)
 
 
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index b5423d11301..01f116ea688 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -4,22 +4,22 @@ from . cimport symbols
 cpdef enum univ_pos_t:
     NO_TAG = 0
     ADJ = symbols.ADJ
-    ADP
-    ADV
-    AUX
-    CONJ
-    CCONJ  # U20
-    DET
-    INTJ
-    NOUN
-    NUM
-    PART
-    PRON
-    PROPN
-    PUNCT
-    SCONJ
-    SYM
-    VERB
-    X
-    EOL
-    SPACE
+    ADP = symbols.ADP
+    ADV = symbols.ADV
+    AUX = symbols.AUX
+    CONJ = symbols.CONJ
+    CCONJ  = symbols.CCONJ  # U20
+    DET = symbols.DET
+    INTJ = symbols.INTJ
+    NOUN = symbols.NOUN
+    NUM = symbols.NUM
+    PART = symbols.PART
+    PRON = symbols.PRON
+    PROPN = symbols.PROPN
+    PUNCT = symbols.PUNCT
+    SCONJ = symbols.SCONJ
+    SYM = symbols.SYM
+    VERB = symbols.VERB
+    X = symbols.X
+    EOL = symbols.EOL
+    SPACE = symbols.SPACE
diff --git a/spacy/schemas.py b/spacy/schemas.py
index fa987b90f19..9a2b5ed60e9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -181,7 +181,7 @@ def validate_init_settings(
 
 def validate_token_pattern(obj: list) -> List[str]:
     # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
-    get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
+    get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
     if isinstance(obj, list):
         converted = []
         for pattern in obj:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 376a131751e..e73b66dff54 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -148,7 +148,7 @@ cdef class StringStore:
         elif _try_coerce_to_hash(string_or_id, &str_hash):
             if str_hash == 0:
                 return ""
-            elif str_hash < len(SYMBOLS_BY_INT):
+            elif str_hash in SYMBOLS_BY_INT:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
@@ -224,7 +224,7 @@ cdef class StringStore:
             # TODO: Raise an error instead
             return self._map.get(string_or_id) is not NULL
 
-        if str_hash < len(SYMBOLS_BY_INT):
+        if str_hash in SYMBOLS_BY_INT:
             return True
         else:
             return self._map.get(str_hash) is not NULL
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 73be19145b2..9e74bf67620 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -1,5 +1,6 @@
+# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0.
 cdef enum symbol_t:
-    NIL
+    NIL = 0
     IS_ALPHA
     IS_ASCII
     IS_DIGIT
@@ -65,7 +66,7 @@ cdef enum symbol_t:
     FLAG62
     FLAG63
 
-    ID
+    ID = 64
     ORTH
     LOWER
     NORM
@@ -385,7 +386,7 @@ cdef enum symbol_t:
     DEPRECATED275
     DEPRECATED276
 
-    PERSON
+    PERSON = 380
     NORP
     FACILITY
     ORG
@@ -405,7 +406,7 @@ cdef enum symbol_t:
     ORDINAL
     CARDINAL
 
-    acomp
+    acomp = 398
     advcl
     advmod
     agent
@@ -458,12 +459,12 @@ cdef enum symbol_t:
     rcmod
     root
     xcomp
-
     acl
 
-    ENT_KB_ID
+    ENT_KB_ID = 452
     MORPH
     ENT_ID
 
     IDX
-    _
+    _ = 456
+    # DO NOT ADD ANY NEW SYMBOLS!
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index f7713577bd3..d2a8a428954 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -470,11 +470,7 @@ IDS = {
 }
 
 
-def sort_nums(x):
-    return x[1]
-
-
-NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
+NAMES = {v: k for k, v in IDS.items()}
 # Unfortunate hack here, to work around problem with long cpdef enum
 # (which is generating an enormous amount of C++ in Cython 0.24+)
 # We keep the enum cdef, and just make sure the names are available to Python
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
new file mode 100644
index 00000000000..fb034accac2
--- /dev/null
+++ b/spacy/tests/test_symbols.py
@@ -0,0 +1,467 @@
+import pytest
+from spacy.symbols import IDS, NAMES
+
+V3_SYMBOLS = {
+    "": 0,
+    "IS_ALPHA": 1,
+    "IS_ASCII": 2,
+    "IS_DIGIT": 3,
+    "IS_LOWER": 4,
+    "IS_PUNCT": 5,
+    "IS_SPACE": 6,
+    "IS_TITLE": 7,
+    "IS_UPPER": 8,
+    "LIKE_URL": 9,
+    "LIKE_NUM": 10,
+    "LIKE_EMAIL": 11,
+    "IS_STOP": 12,
+    "IS_OOV_DEPRECATED": 13,
+    "IS_BRACKET": 14,
+    "IS_QUOTE": 15,
+    "IS_LEFT_PUNCT": 16,
+    "IS_RIGHT_PUNCT": 17,
+    "IS_CURRENCY": 18,
+    "FLAG19": 19,
+    "FLAG20": 20,
+    "FLAG21": 21,
+    "FLAG22": 22,
+    "FLAG23": 23,
+    "FLAG24": 24,
+    "FLAG25": 25,
+    "FLAG26": 26,
+    "FLAG27": 27,
+    "FLAG28": 28,
+    "FLAG29": 29,
+    "FLAG30": 30,
+    "FLAG31": 31,
+    "FLAG32": 32,
+    "FLAG33": 33,
+    "FLAG34": 34,
+    "FLAG35": 35,
+    "FLAG36": 36,
+    "FLAG37": 37,
+    "FLAG38": 38,
+    "FLAG39": 39,
+    "FLAG40": 40,
+    "FLAG41": 41,
+    "FLAG42": 42,
+    "FLAG43": 43,
+    "FLAG44": 44,
+    "FLAG45": 45,
+    "FLAG46": 46,
+    "FLAG47": 47,
+    "FLAG48": 48,
+    "FLAG49": 49,
+    "FLAG50": 50,
+    "FLAG51": 51,
+    "FLAG52": 52,
+    "FLAG53": 53,
+    "FLAG54": 54,
+    "FLAG55": 55,
+    "FLAG56": 56,
+    "FLAG57": 57,
+    "FLAG58": 58,
+    "FLAG59": 59,
+    "FLAG60": 60,
+    "FLAG61": 61,
+    "FLAG62": 62,
+    "FLAG63": 63,
+    "ID": 64,
+    "ORTH": 65,
+    "LOWER": 66,
+    "NORM": 67,
+    "SHAPE": 68,
+    "PREFIX": 69,
+    "SUFFIX": 70,
+    "LENGTH": 71,
+    "CLUSTER": 72,
+    "LEMMA": 73,
+    "POS": 74,
+    "TAG": 75,
+    "DEP": 76,
+    "ENT_IOB": 77,
+    "ENT_TYPE": 78,
+    "ENT_ID": 454,
+    "ENT_KB_ID": 452,
+    "HEAD": 79,
+    "SENT_START": 80,
+    "SPACY": 81,
+    "PROB": 82,
+    "LANG": 83,
+    "IDX": 455,
+    "ADJ": 84,
+    "ADP": 85,
+    "ADV": 86,
+    "AUX": 87,
+    "CONJ": 88,
+    "CCONJ": 89,
+    "DET": 90,
+    "INTJ": 91,
+    "NOUN": 92,
+    "NUM": 93,
+    "PART": 94,
+    "PRON": 95,
+    "PROPN": 96,
+    "PUNCT": 97,
+    "SCONJ": 98,
+    "SYM": 99,
+    "VERB": 100,
+    "X": 101,
+    "EOL": 102,
+    "SPACE": 103,
+    "DEPRECATED001": 104,
+    "DEPRECATED002": 105,
+    "DEPRECATED003": 106,
+    "DEPRECATED004": 107,
+    "DEPRECATED005": 108,
+    "DEPRECATED006": 109,
+    "DEPRECATED007": 110,
+    "DEPRECATED008": 111,
+    "DEPRECATED009": 112,
+    "DEPRECATED010": 113,
+    "DEPRECATED011": 114,
+    "DEPRECATED012": 115,
+    "DEPRECATED013": 116,
+    "DEPRECATED014": 117,
+    "DEPRECATED015": 118,
+    "DEPRECATED016": 119,
+    "DEPRECATED017": 120,
+    "DEPRECATED018": 121,
+    "DEPRECATED019": 122,
+    "DEPRECATED020": 123,
+    "DEPRECATED021": 124,
+    "DEPRECATED022": 125,
+    "DEPRECATED023": 126,
+    "DEPRECATED024": 127,
+    "DEPRECATED025": 128,
+    "DEPRECATED026": 129,
+    "DEPRECATED027": 130,
+    "DEPRECATED028": 131,
+    "DEPRECATED029": 132,
+    "DEPRECATED030": 133,
+    "DEPRECATED031": 134,
+    "DEPRECATED032": 135,
+    "DEPRECATED033": 136,
+    "DEPRECATED034": 137,
+    "DEPRECATED035": 138,
+    "DEPRECATED036": 139,
+    "DEPRECATED037": 140,
+    "DEPRECATED038": 141,
+    "DEPRECATED039": 142,
+    "DEPRECATED040": 143,
+    "DEPRECATED041": 144,
+    "DEPRECATED042": 145,
+    "DEPRECATED043": 146,
+    "DEPRECATED044": 147,
+    "DEPRECATED045": 148,
+    "DEPRECATED046": 149,
+    "DEPRECATED047": 150,
+    "DEPRECATED048": 151,
+    "DEPRECATED049": 152,
+    "DEPRECATED050": 153,
+    "DEPRECATED051": 154,
+    "DEPRECATED052": 155,
+    "DEPRECATED053": 156,
+    "DEPRECATED054": 157,
+    "DEPRECATED055": 158,
+    "DEPRECATED056": 159,
+    "DEPRECATED057": 160,
+    "DEPRECATED058": 161,
+    "DEPRECATED059": 162,
+    "DEPRECATED060": 163,
+    "DEPRECATED061": 164,
+    "DEPRECATED062": 165,
+    "DEPRECATED063": 166,
+    "DEPRECATED064": 167,
+    "DEPRECATED065": 168,
+    "DEPRECATED066": 169,
+    "DEPRECATED067": 170,
+    "DEPRECATED068": 171,
+    "DEPRECATED069": 172,
+    "DEPRECATED070": 173,
+    "DEPRECATED071": 174,
+    "DEPRECATED072": 175,
+    "DEPRECATED073": 176,
+    "DEPRECATED074": 177,
+    "DEPRECATED075": 178,
+    "DEPRECATED076": 179,
+    "DEPRECATED077": 180,
+    "DEPRECATED078": 181,
+    "DEPRECATED079": 182,
+    "DEPRECATED080": 183,
+    "DEPRECATED081": 184,
+    "DEPRECATED082": 185,
+    "DEPRECATED083": 186,
+    "DEPRECATED084": 187,
+    "DEPRECATED085": 188,
+    "DEPRECATED086": 189,
+    "DEPRECATED087": 190,
+    "DEPRECATED088": 191,
+    "DEPRECATED089": 192,
+    "DEPRECATED090": 193,
+    "DEPRECATED091": 194,
+    "DEPRECATED092": 195,
+    "DEPRECATED093": 196,
+    "DEPRECATED094": 197,
+    "DEPRECATED095": 198,
+    "DEPRECATED096": 199,
+    "DEPRECATED097": 200,
+    "DEPRECATED098": 201,
+    "DEPRECATED099": 202,
+    "DEPRECATED100": 203,
+    "DEPRECATED101": 204,
+    "DEPRECATED102": 205,
+    "DEPRECATED103": 206,
+    "DEPRECATED104": 207,
+    "DEPRECATED105": 208,
+    "DEPRECATED106": 209,
+    "DEPRECATED107": 210,
+    "DEPRECATED108": 211,
+    "DEPRECATED109": 212,
+    "DEPRECATED110": 213,
+    "DEPRECATED111": 214,
+    "DEPRECATED112": 215,
+    "DEPRECATED113": 216,
+    "DEPRECATED114": 217,
+    "DEPRECATED115": 218,
+    "DEPRECATED116": 219,
+    "DEPRECATED117": 220,
+    "DEPRECATED118": 221,
+    "DEPRECATED119": 222,
+    "DEPRECATED120": 223,
+    "DEPRECATED121": 224,
+    "DEPRECATED122": 225,
+    "DEPRECATED123": 226,
+    "DEPRECATED124": 227,
+    "DEPRECATED125": 228,
+    "DEPRECATED126": 229,
+    "DEPRECATED127": 230,
+    "DEPRECATED128": 231,
+    "DEPRECATED129": 232,
+    "DEPRECATED130": 233,
+    "DEPRECATED131": 234,
+    "DEPRECATED132": 235,
+    "DEPRECATED133": 236,
+    "DEPRECATED134": 237,
+    "DEPRECATED135": 238,
+    "DEPRECATED136": 239,
+    "DEPRECATED137": 240,
+    "DEPRECATED138": 241,
+    "DEPRECATED139": 242,
+    "DEPRECATED140": 243,
+    "DEPRECATED141": 244,
+    "DEPRECATED142": 245,
+    "DEPRECATED143": 246,
+    "DEPRECATED144": 247,
+    "DEPRECATED145": 248,
+    "DEPRECATED146": 249,
+    "DEPRECATED147": 250,
+    "DEPRECATED148": 251,
+    "DEPRECATED149": 252,
+    "DEPRECATED150": 253,
+    "DEPRECATED151": 254,
+    "DEPRECATED152": 255,
+    "DEPRECATED153": 256,
+    "DEPRECATED154": 257,
+    "DEPRECATED155": 258,
+    "DEPRECATED156": 259,
+    "DEPRECATED157": 260,
+    "DEPRECATED158": 261,
+    "DEPRECATED159": 262,
+    "DEPRECATED160": 263,
+    "DEPRECATED161": 264,
+    "DEPRECATED162": 265,
+    "DEPRECATED163": 266,
+    "DEPRECATED164": 267,
+    "DEPRECATED165": 268,
+    "DEPRECATED166": 269,
+    "DEPRECATED167": 270,
+    "DEPRECATED168": 271,
+    "DEPRECATED169": 272,
+    "DEPRECATED170": 273,
+    "DEPRECATED171": 274,
+    "DEPRECATED172": 275,
+    "DEPRECATED173": 276,
+    "DEPRECATED174": 277,
+    "DEPRECATED175": 278,
+    "DEPRECATED176": 279,
+    "DEPRECATED177": 280,
+    "DEPRECATED178": 281,
+    "DEPRECATED179": 282,
+    "DEPRECATED180": 283,
+    "DEPRECATED181": 284,
+    "DEPRECATED182": 285,
+    "DEPRECATED183": 286,
+    "DEPRECATED184": 287,
+    "DEPRECATED185": 288,
+    "DEPRECATED186": 289,
+    "DEPRECATED187": 290,
+    "DEPRECATED188": 291,
+    "DEPRECATED189": 292,
+    "DEPRECATED190": 293,
+    "DEPRECATED191": 294,
+    "DEPRECATED192": 295,
+    "DEPRECATED193": 296,
+    "DEPRECATED194": 297,
+    "DEPRECATED195": 298,
+    "DEPRECATED196": 299,
+    "DEPRECATED197": 300,
+    "DEPRECATED198": 301,
+    "DEPRECATED199": 302,
+    "DEPRECATED200": 303,
+    "DEPRECATED201": 304,
+    "DEPRECATED202": 305,
+    "DEPRECATED203": 306,
+    "DEPRECATED204": 307,
+    "DEPRECATED205": 308,
+    "DEPRECATED206": 309,
+    "DEPRECATED207": 310,
+    "DEPRECATED208": 311,
+    "DEPRECATED209": 312,
+    "DEPRECATED210": 313,
+    "DEPRECATED211": 314,
+    "DEPRECATED212": 315,
+    "DEPRECATED213": 316,
+    "DEPRECATED214": 317,
+    "DEPRECATED215": 318,
+    "DEPRECATED216": 319,
+    "DEPRECATED217": 320,
+    "DEPRECATED218": 321,
+    "DEPRECATED219": 322,
+    "DEPRECATED220": 323,
+    "DEPRECATED221": 324,
+    "DEPRECATED222": 325,
+    "DEPRECATED223": 326,
+    "DEPRECATED224": 327,
+    "DEPRECATED225": 328,
+    "DEPRECATED226": 329,
+    "DEPRECATED227": 330,
+    "DEPRECATED228": 331,
+    "DEPRECATED229": 332,
+    "DEPRECATED230": 333,
+    "DEPRECATED231": 334,
+    "DEPRECATED232": 335,
+    "DEPRECATED233": 336,
+    "DEPRECATED234": 337,
+    "DEPRECATED235": 338,
+    "DEPRECATED236": 339,
+    "DEPRECATED237": 340,
+    "DEPRECATED238": 341,
+    "DEPRECATED239": 342,
+    "DEPRECATED240": 343,
+    "DEPRECATED241": 344,
+    "DEPRECATED242": 345,
+    "DEPRECATED243": 346,
+    "DEPRECATED244": 347,
+    "DEPRECATED245": 348,
+    "DEPRECATED246": 349,
+    "DEPRECATED247": 350,
+    "DEPRECATED248": 351,
+    "DEPRECATED249": 352,
+    "DEPRECATED250": 353,
+    "DEPRECATED251": 354,
+    "DEPRECATED252": 355,
+    "DEPRECATED253": 356,
+    "DEPRECATED254": 357,
+    "DEPRECATED255": 358,
+    "DEPRECATED256": 359,
+    "DEPRECATED257": 360,
+    "DEPRECATED258": 361,
+    "DEPRECATED259": 362,
+    "DEPRECATED260": 363,
+    "DEPRECATED261": 364,
+    "DEPRECATED262": 365,
+    "DEPRECATED263": 366,
+    "DEPRECATED264": 367,
+    "DEPRECATED265": 368,
+    "DEPRECATED266": 369,
+    "DEPRECATED267": 370,
+    "DEPRECATED268": 371,
+    "DEPRECATED269": 372,
+    "DEPRECATED270": 373,
+    "DEPRECATED271": 374,
+    "DEPRECATED272": 375,
+    "DEPRECATED273": 376,
+    "DEPRECATED274": 377,
+    "DEPRECATED275": 378,
+    "DEPRECATED276": 379,
+    "PERSON": 380,
+    "NORP": 381,
+    "FACILITY": 382,
+    "ORG": 383,
+    "GPE": 384,
+    "LOC": 385,
+    "PRODUCT": 386,
+    "EVENT": 387,
+    "WORK_OF_ART": 388,
+    "LANGUAGE": 389,
+    "DATE": 391,
+    "TIME": 392,
+    "PERCENT": 393,
+    "MONEY": 394,
+    "QUANTITY": 395,
+    "ORDINAL": 396,
+    "CARDINAL": 397,
+    "acomp": 398,
+    "advcl": 399,
+    "advmod": 400,
+    "agent": 401,
+    "amod": 402,
+    "appos": 403,
+    "attr": 404,
+    "aux": 405,
+    "auxpass": 406,
+    "cc": 407,
+    "ccomp": 408,
+    "complm": 409,
+    "conj": 410,
+    "cop": 411,
+    "csubj": 412,
+    "csubjpass": 413,
+    "dep": 414,
+    "det": 415,
+    "dobj": 416,
+    "expl": 417,
+    "hmod": 418,
+    "hyph": 419,
+    "infmod": 420,
+    "intj": 421,
+    "iobj": 422,
+    "mark": 423,
+    "meta": 424,
+    "neg": 425,
+    "nmod": 426,
+    "nn": 427,
+    "npadvmod": 428,
+    "nsubj": 429,
+    "nsubjpass": 430,
+    "num": 431,
+    "number": 432,
+    "oprd": 433,
+    "obj": 434,
+    "obl": 435,
+    "parataxis": 436,
+    "partmod": 437,
+    "pcomp": 438,
+    "pobj": 439,
+    "poss": 440,
+    "possessive": 441,
+    "preconj": 442,
+    "prep": 443,
+    "prt": 444,
+    "punct": 445,
+    "quantmod": 446,
+    "rcmod": 448,
+    "relcl": 447,
+    "root": 449,
+    "xcomp": 450,
+    "acl": 451,
+    "LAW": 390,
+    "MORPH": 453,
+    "_": 456,
+}
+
+
+def test_frozen_symbols():
+    assert IDS == V3_SYMBOLS
+    assert NAMES == {v: k for k, v in IDS.items()}
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f9a706f4a98..18cb08c7552 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1025,22 +1025,26 @@ cdef class Doc:
                 for id_ in py_attr_ids
             ]
         except KeyError as msg:
-            keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
+            keys = list(IDS.keys())
             raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
         # Make an array from the attributes --- otherwise our inner loop is
         # Python dict iteration.
-        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
-        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
+        cdef Pool mem = Pool()
+        cdef int n_attrs = len(py_attr_ids)
+        cdef attr_id_t* c_attr_ids
+        if n_attrs > 0:
+            c_attr_ids = <attr_id_t*>mem.alloc(n_attrs, sizeof(attr_id_t))
+            for i, attr_id in enumerate(py_attr_ids):
+                c_attr_ids[i] = attr_id
+        output = numpy.ndarray(shape=(self.length, n_attrs), dtype=numpy.uint64)
         c_output = <attr_t*>output.data
-        c_attr_ids = <attr_id_t*>attr_ids.data
         cdef TokenC* token
-        cdef int nr_attr = attr_ids.shape[0]
         for i in range(self.length):
             token = &self.c[i]
-            for j in range(nr_attr):
-                c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j])
+            for j in range(n_attrs):
+                c_output[i*n_attrs + j] = get_token_attr(token, c_attr_ids[j])
         # Handle 1d case
-        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
+        return output if n_attrs >= 2 else output.reshape((self.length,))
 
     def count_by(self, attr_id_t attr_id, exclude=None, object counts=None):
         """Count the frequencies of a given attribute. Produces a dict of

From 84b15a88d98f66a3a0d12f82477f2dcff3979a45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH 237/504] Store activations in `Doc`s when `save_activations`
 is enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/edit_tree_lemmatizer.py        |  46 ++--
 spacy/pipeline/entity_linker.py               | 244 +++++++++++-------
 spacy/pipeline/morphologizer.pyx              |  37 ++-
 spacy/pipeline/senter.pyx                     |  38 ++-
 spacy/pipeline/spancat.py                     |  84 +++---
 spacy/pipeline/tagger.pyx                     |  43 ++-
 spacy/pipeline/textcat.py                     |  37 ++-
 spacy/pipeline/textcat_multilabel.py          |  23 +-
 spacy/pipeline/trainable_pipe.pxd             |   1 +
 spacy/pipeline/trainable_pipe.pyx             |  14 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |  26 ++
 spacy/tests/pipeline/test_entity_linker.py    |  78 ++++--
 spacy/tests/pipeline/test_morphologizer.py    |  26 +-
 spacy/tests/pipeline/test_senter.py           |  25 ++
 spacy/tests/pipeline/test_spancat.py          |  34 +--
 spacy/tests/pipeline/test_tagger.py           |  24 +-
 spacy/tests/pipeline/test_textcat.py          |  64 +++--
 spacy/tokens/doc.pxd                          |   2 +
 spacy/tokens/doc.pyi                          |   3 +-
 spacy/tokens/doc.pyx                          |   1 +
 website/docs/api/doc.mdx                      |  33 +--
 website/docs/api/edittreelemmatizer.mdx       |  17 +-
 website/docs/api/entitylinker.mdx             |  29 +--
 website/docs/api/morphologizer.mdx            |  18 +-
 website/docs/api/sentencerecognizer.mdx       |  11 +-
 website/docs/api/spancategorizer.mdx          |  35 +--
 website/docs/api/tagger.mdx                   |  14 +-
 website/docs/api/textcategorizer.mdx          |  17 +-
 28 files changed, 669 insertions(+), 355 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 4a6174bc3d8..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,8 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -22,6 +22,9 @@
 TOP_K_GUARDRAIL = 20
 
 
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -50,6 +53,7 @@
         "overwrite": False,
         "top_k": 1,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"lemma_acc": 1.0},
 )
@@ -62,6 +66,7 @@ def make_edit_tree_lemmatizer(
     overwrite: bool,
     top_k: int,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
     """Construct an EditTreeLemmatizer component."""
     return EditTreeLemmatizer(
@@ -73,6 +78,7 @@ def make_edit_tree_lemmatizer(
         overwrite=overwrite,
         top_k=top_k,
         scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -92,6 +98,7 @@ def __init__(
         overwrite: bool = False,
         top_k: int = 1,
         scorer: Optional[Callable] = lemmatizer_score,
+        save_activations: bool = False,
     ):
         """
         Construct an edit tree lemmatizer.
@@ -103,6 +110,7 @@ def __init__(
             frequency in the training data.
         overwrite (bool): overwrite existing lemma annotations.
         top_k (int): try to apply at most the k most probable edit trees.
+        save_activations (bool): save model activations in Doc when annotating.
         """
         self.vocab = vocab
         self.model = model
@@ -117,7 +125,7 @@ def __init__(
 
         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
-        self.numpy_ops = NumpyOps()
+        self.save_activations = save_activations
 
     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
@@ -146,31 +154,24 @@ def get_loss(
 
         return float(loss), d_scores
 
-    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
-        if self.top_k == 1:
-            scores2guesses = self._scores2guesses_top_k_equals_1
-        elif self.top_k <= TOP_K_GUARDRAIL:
-            scores2guesses = self._scores2guesses_top_k_greater_1
-        else:
-            scores2guesses = self._scores2guesses_top_k_guardrail
-        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
-        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
-        # for its principal purpose of lemmatizing tokens. However, the code could also
-        # be used for other purposes, and with very large values of *top_k* the method
-        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
-        # instead.
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
+            guesses: List[Ints1d] = [
+                self.model.ops.alloc((0,), dtype="i") for doc in docs
+            ]
+            scores: List[Floats2d] = [
+                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
+            ]
             assert len(guesses) == n_docs
-            return guesses
+            return {"probabilities": scores, "tree_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
         guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
-        return guesses
+        return {"probabilities": scores, "tree_ids": guesses}
 
     def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
@@ -230,8 +231,13 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
 
         return guesses
 
-    def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
+        batch_tree_ids = activations["tree_ids"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tree_ids = batch_tree_ids[i]
             if hasattr(doc_tree_ids, "get"):
                 doc_tree_ids = doc_tree_ids.get()
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 40a9c8a79dc..0f15ef38d45 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,10 @@
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
+from typing import cast
+from numpy import dtype
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from pathlib import Path
+from itertools import islice
+import srsly
 import random
 from itertools import islice
 from pathlib import Path
@@ -20,6 +27,11 @@
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
+
+ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
+
+KNOWLEDGE_BASE_IDS = "kb_ids"
+
 # See #9050
 BACKWARD_OVERWRITE = True
 
@@ -59,6 +71,7 @@
         "use_gold_ents": True,
         "candidates_batch_size": 1,
         "threshold": None,
+        "save_activations": False,
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -86,6 +99,7 @@ def make_entity_linker(
     use_gold_ents: bool,
     candidates_batch_size: int,
     threshold: Optional[float] = None,
+    save_activations: bool,
 ):
     """Construct an EntityLinker component.
 
@@ -109,6 +123,7 @@ def make_entity_linker(
     candidates_batch_size (int): Size of batches for entity candidate generation.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.
+    save_activations (bool): save model activations in Doc when annotating.
     """
 
     if not model.attrs.get("include_span_maker", False):
@@ -143,6 +158,7 @@ def make_entity_linker(
         use_gold_ents=use_gold_ents,
         candidates_batch_size=candidates_batch_size,
         threshold=threshold,
+        save_activations=save_activations,
     )
 
 
@@ -184,6 +200,7 @@ def __init__(
         use_gold_ents: bool,
         candidates_batch_size: int,
         threshold: Optional[float] = None,
+        save_activations: bool = False,
     ) -> None:
         """Initialize an entity linker.
 
@@ -237,6 +254,7 @@ def __init__(
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold
+        self.save_activations = save_activations
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
@@ -442,7 +460,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         loss = loss / len(entity_encodings)
         return float(loss), out
 
-    def predict(self, docs: Iterable[Doc]) -> List[str]:
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
         Returns the KB IDs for each entity in each doc, including NIL if there is
         no prediction.
@@ -455,129 +473,138 @@ def predict(self, docs: Iterable[Doc]) -> List[str]:
         self.validate_kb()
         entity_count = 0
         final_kb_ids: List[str] = []
-        xp = self.model.ops.xp
+        ops = self.model.ops
+        xp = ops.xp
+        docs_ents: List[Ragged] = []
+        docs_scores: List[Ragged] = []
         if not docs:
-            return final_kb_ids
+            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
         if isinstance(docs, Doc):
             docs = [docs]
-        for i, doc in enumerate(docs):
+        for doc in docs:
+            doc_ents: List[Ints1d] = []
+            doc_scores: List[Floats1d] = []
             if len(doc) == 0:
+                docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
+                docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 continue
             sentences = [s for s in doc.sents]
 
-            # Loop over entities in batches.
-            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
-                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
-                # Look up candidate entities.
-                valid_ent_idx = [
-                    idx
-                    for idx in range(len(ent_batch))
-                    if ent_batch[idx].label_ not in self.labels_discard
-                ]
-
-                batch_candidates = list(
-                    self.get_candidates_batch(
-                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
-                    )
-                    if self.candidates_batch_size > 1
-                    else [
-                        self.get_candidates(self.kb, ent_batch[idx])
-                        for idx in valid_ent_idx
-                    ]
-                )
-
-                # Looping through each entity in batch (TODO: rewrite)
-                for j, ent in enumerate(ent_batch):
-                    assert hasattr(ent, "sents")
-                    sents = list(ent.sents)
-                    sent_indices = (
-                        sentences.index(sents[0]),
-                        sentences.index(sents[-1]),
+                if self.incl_context:
+                    # get n_neighbour sentences, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    sentence_encoding = self.model.predict([sent_doc])[0]
+                    sentence_encoding_t = sentence_encoding.T
+                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                entity_count += 1
+                if ent.label_ in self.labels_discard:
+                    # ignoring this entity - setting to NIL
+                    final_kb_ids.append(self.NIL)
+                    self._add_activations(
+                        doc_scores=doc_scores,
+                        doc_ents=doc_ents,
+                        scores=[0.0],
+                        ents=[0],
                     )
-                    assert sent_indices[1] >= sent_indices[0] >= 0
-
-                    if self.incl_context:
-                        # get n_neighbour sentences, clipped to the length of the document
-                        start_sentence = max(0, sent_indices[0] - self.n_sents)
-                        end_sentence = min(
-                            len(sentences) - 1, sent_indices[1] + self.n_sents
-                        )
-                        start_token = sentences[start_sentence].start
-                        end_token = sentences[end_sentence].end
-                        sent_doc = doc[start_token:end_token].as_doc()
-
-                        # currently, the context is the same for each entity in a sentence (should be refined)
-                        sentence_encoding = self.model.predict([sent_doc])[0]
-                        sentence_encoding_t = sentence_encoding.T
-                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                    entity_count += 1
-                    if ent.label_ in self.labels_discard:
-                        # ignoring this entity - setting to NIL
+                else:
+                    candidates = list(self.get_candidates(self.kb, ent))
+                    if not candidates:
+                        # no prediction possible for this entity - setting to NIL
                         final_kb_ids.append(self.NIL)
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=[0.0],
+                            ents=[0],
+                        )
+                    elif len(candidates) == 1 and self.threshold is None:
+                        # shortcut for efficiency reasons: take the 1 candidate
+                        final_kb_ids.append(candidates[0].entity_)
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=[1.0],
+                            ents=[candidates[0].entity_],
+                        )
                     else:
-                        candidates = list(batch_candidates[j])
-                        if not candidates:
-                            # no prediction possible for this entity - setting to NIL
-                            final_kb_ids.append(self.NIL)
-                        elif len(candidates) == 1 and self.threshold is None:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
+                        random.shuffle(candidates)
+                        # set all prior probabilities to 0 if incl_prior=False
+                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
+                        if not self.incl_prior:
+                            prior_probs = xp.asarray([0.0 for _ in candidates])
+                        scores = prior_probs
+                        # add in similarity from the context
+                        if self.incl_context:
+                            entity_encodings = xp.asarray(
+                                [c.entity_vector for c in candidates]
+                            )
+                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                            if len(entity_encodings) != len(prior_probs):
+                                raise RuntimeError(
+                                    Errors.E147.format(
+                                        method="predict",
+                                        msg="vectors not of equal length",
                                     )
-                                # cosine similarity
-                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                    sentence_norm * entity_norm
                                 )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            final_kb_ids.append(
-                                candidates[scores.argmax().item()].entity_
-                                if self.threshold is None
-                                or scores.max() >= self.threshold
-                                else EntityLinker.NIL
+                            # cosine similarity
+                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+                                sentence_norm * entity_norm
                             )
-
+                            if sims.shape != prior_probs.shape:
+                                raise ValueError(Errors.E161)
+                            scores = prior_probs + sims - (prior_probs * sims)
+                        final_kb_ids.append(
+                            candidates[scores.argmax().item()].entity_
+                            if self.threshold is None or scores.max() >= self.threshold
+                            else EntityLinker.NIL
+                        )
+                        self._add_activations(
+                            doc_scores=doc_scores,
+                            doc_ents=doc_ents,
+                            scores=scores,
+                            ents=[c.entity for c in candidates],
+                        )
+            self._add_doc_activations(
+                docs_scores=docs_scores,
+                docs_ents=docs_ents,
+                doc_scores=doc_scores,
+                doc_ents=doc_ents,
+            )
         if not (len(final_kb_ids) == entity_count):
             err = Errors.E147.format(
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return final_kb_ids
+        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
 
-    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced
+                                 by EntityLinker.predict.
 
         DOCS: https://spacy.io/api/entitylinker#set_annotations
         """
+        kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
         count_ents = len([ent for doc in docs for ent in doc.ents])
         if count_ents != len(kb_ids):
             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
         i = 0
         overwrite = self.cfg["overwrite"]
-        for doc in docs:
+        for j, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    if act_name != KNOWLEDGE_BASE_IDS:
+                        # We only copy activations that are Ragged.
+                        doc.activations[self.name][act_name] = cast(Ragged, acts[j])
+
             for ent in doc.ents:
                 kb_id = kb_ids[i]
                 i += 1
@@ -676,3 +703,32 @@ def rehearse(self, examples, *, sgd=None, losses=None, **config):
 
     def add_label(self, label):
         raise NotImplementedError
+
+    def _add_doc_activations(
+        self,
+        *,
+        docs_scores: List[Ragged],
+        docs_ents: List[Ragged],
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
+        docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
+        docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
+
+    def _add_activations(
+        self,
+        *,
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+        scores: Sequence[float],
+        ents: Sequence[int],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        doc_scores.append(ops.asarray1f(scores))
+        doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index bdbe75fd824..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,4 +1,8 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import srsly
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
 
@@ -8,6 +12,12 @@ from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
+from ..language import Language
+from ..errors import Errors
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -50,8 +60,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
-                    "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -62,8 +77,10 @@ def make_morphologizer(
     extend: bool,
     label_smoothing: float,
     scorer: Optional[Callable],
+    save_activations: bool,
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+                         save_activations=save_activations)
 
 
 def morphologizer_score(examples, **kwargs):
@@ -99,6 +116,7 @@ class Morphologizer(Tagger):
         extend: bool = BACKWARD_EXTEND,
         label_smoothing: float = 0.0,
         scorer: Optional[Callable] = morphologizer_score,
+        save_activations: bool = False,
     ):
         """Initialize a morphologizer.
 
@@ -109,6 +127,7 @@ class Morphologizer(Tagger):
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/morphologizer#init
         """
@@ -129,6 +148,7 @@ class Morphologizer(Tagger):
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -222,14 +242,15 @@ class Morphologizer(Tagger):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
 
         DOCS: https://spacy.io/api/morphologizer#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
@@ -240,6 +261,10 @@ class Morphologizer(Tagger):
         # to allocate a compatible container out of the iterable.
         labels = tuple(self.labels)
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index df093baa9c6..521afe1d181 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,12 +1,16 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
 from typing import Callable, Optional
 
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
@@ -37,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "senter",
     assigns=["token.is_sent_start"],
-    default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
+    default_config={
+        "model": DEFAULT_SENTER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
-def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
-    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
+def make_senter(nlp: Language,
+                name: str,
+                model: Model,
+                overwrite: bool,
+                scorer: Optional[Callable],
+                save_activations: bool):
+    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
 
 
 def senter_score(examples, **kwargs):
@@ -71,6 +85,7 @@ class SentenceRecognizer(Tagger):
         *,
         overwrite=BACKWARD_OVERWRITE,
         scorer=senter_score,
+        save_activations: bool = False,
     ):
         """Initialize a sentence recognizer.
 
@@ -80,6 +95,7 @@ class SentenceRecognizer(Tagger):
             losses during training.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/sentencerecognizer#init
         """
@@ -89,6 +105,7 @@ class SentenceRecognizer(Tagger):
         self._rehearsal_model = None
         self.cfg = {"overwrite": overwrite}
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -106,19 +123,24 @@ class SentenceRecognizer(Tagger):
     def label_data(self):
         return None
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
 
         DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 08a5478a912..1450bb5d6cb 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
@@ -16,6 +18,9 @@
 from ..vocab import Vocab
 from .trainable_pipe import TrainablePipe
 
+ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+
+
 spancat_default_config = """
 [model]
 @architectures = "spacy.SpanCategorizer.v1"
@@ -170,6 +175,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
         "model": DEFAULT_SPANCAT_MODEL,
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -182,6 +188,7 @@ def make_spancat(
     scorer: Optional[Callable],
     threshold: float,
     max_positive: Optional[int],
+    save_activations: bool,
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component and configure it for multi-label
     classification to be able to assign multiple labels for each span.
@@ -209,6 +216,7 @@ def make_spancat(
         0.5.
     max_positive (Optional[int]): Maximum number of labels to consider positive
         per span. Defaults to None, indicating no limit.
+        save_activations (bool): save model activations in Doc when annotating.
     """
     return SpanCategorizer(
         nlp.vocab,
@@ -287,6 +295,7 @@ def make_spancat_singlelabel(
         add_negative_label=True,
         threshold=None,
         scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -349,6 +358,7 @@ def __init__(
         max_positive: Optional[int] = None,
         threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize the multi-label or multi-class span categorizer.
 
@@ -398,9 +408,7 @@ def __init__(
         self.model = model
         self.name = name
         self.scorer = scorer
-        self.add_negative_label = add_negative_label
-        if not allow_overlap and max_positive is not None and max_positive > 1:
-            raise ValueError(Errors.E1051.format(max_positive=max_positive))
+        self.save_activations = save_activations
 
     @property
     def key(self) -> str:
@@ -458,28 +466,7 @@ def label_data(self) -> List[str]:
         """
         return list(self.labels)
 
-    @property
-    def _label_map(self) -> Dict[str, int]:
-        """RETURNS (Dict[str, int]): The label map."""
-        return {label: i for i, label in enumerate(self.labels)}
-
-    @property
-    def _n_labels(self) -> int:
-        """RETURNS (int): Number of labels."""
-        if self.add_negative_label:
-            return len(self.labels) + 1
-        else:
-            return len(self.labels)
-
-    @property
-    def _negative_label_i(self) -> Union[int, None]:
-        """RETURNS (Union[int, None]): Index of the negative label."""
-        if self.add_negative_label:
-            return len(self.label_data)
-        else:
-            return None
-
-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -488,11 +475,8 @@ def predict(self, docs: Iterable[Doc]):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        if indices.lengths.sum() == 0:
-            scores = self.model.ops.alloc2f(0, 0)
-        else:
-            scores = self.model.predict((docs, indices))  # type: ignore
-        return indices, scores
+        scores = self.model.predict((docs, indices))  # type: ignore
+        return {"indices": indices, "scores": scores}
 
     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -512,32 +496,32 @@ def set_candidates(
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])
 
-    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        scores: The scores to set, produced by SpanCategorizer.predict.
+        activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
 
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
-        indices, scores = indices_scores
+        labels = self.labels
+
+        indices = activations["indices"]
+        assert isinstance(indices, Ragged)
+        scores = cast(Floats2d, activations["scores"])
+
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
-            allow_overlap = cast(bool, self.cfg["allow_overlap"])
-            if self.cfg["max_positive"] == 1:
-                doc.spans[self.key] = self._make_span_group_singlelabel(
-                    doc,
-                    indices_i,
-                    scores[offset : offset + indices.lengths[i]],
-                    allow_overlap,
-                )
-            else:
-                doc.spans[self.key] = self._make_span_group_multilabel(
-                    doc,
-                    indices_i,
-                    scores[offset : offset + indices.lengths[i]],
-                )
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["indices"] = indices_i
+                doc.activations[self.name]["scores"] = scores[
+                    offset : offset + indices.lengths[i]
+                ]
+            doc.spans[self.key] = self._make_span_group(
+                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
+            )
             offset += indices.lengths[i]
 
     def update(
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 34e85d49c2b..8ecd0c46ee0 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,10 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import numpy
+import srsly
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
+import warnings
 from itertools import islice
 from typing import Callable, Optional
 
@@ -15,6 +21,9 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .trainable_pipe import TrainablePipe
 
+
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
 # See #9050
 BACKWARD_OVERWRITE = False
 
@@ -38,7 +47,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "tagger",
     assigns=["token.tag"],
-    default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_TAGGER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
+        "neg_prefix": "!",
+        "save_activations": False,
+    },
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(
@@ -48,7 +63,7 @@ def make_tagger(
     overwrite: bool,
     scorer: Optional[Callable],
     neg_prefix: str,
-    label_smoothing: float,
+    save_activations: bool,
 ):
     """Construct a part-of-speech tagger component.
 
@@ -57,7 +72,8 @@ def make_tagger(
         in size, and be normalized as probabilities (all scores between 0 and 1,
         with the rows summing to 1).
     """
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
+    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
+                  save_activations=save_activations)
 
 
 def tagger_score(examples, **kwargs):
@@ -83,7 +99,7 @@ class Tagger(TrainablePipe):
         overwrite=BACKWARD_OVERWRITE,
         scorer=tagger_score,
         neg_prefix="!",
-        label_smoothing=0.0,
+        save_activations: bool = False,
     ):
         """Initialize a part-of-speech tagger.
 
@@ -93,6 +109,7 @@ class Tagger(TrainablePipe):
             losses during training.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/tagger#init
         """
@@ -103,6 +120,7 @@ class Tagger(TrainablePipe):
         cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -121,7 +139,7 @@ class Tagger(TrainablePipe):
         """Data about the labels currently added to the component."""
         return tuple(self.cfg["labels"])
 
-    def predict(self, docs):
+    def predict(self, docs) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -134,12 +152,12 @@ class Tagger(TrainablePipe):
             n_labels = len(self.labels)
             guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
             assert len(guesses) == len(docs)
-            return guesses
+            return {"probabilities": guesses, "label_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == len(docs), (len(scores), len(docs))
         guesses = self._scores2guesses(scores)
         assert len(guesses) == len(docs)
-        return guesses
+        return {"probabilities": scores, "label_ids": guesses}
 
     def _scores2guesses(self, scores):
         guesses = []
@@ -150,20 +168,25 @@ class Tagger(TrainablePipe):
             guesses.append(doc_guesses)
         return guesses
 
-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
-        batch_tag_ids: The IDs to set, produced by Tagger.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
 
         DOCS: https://spacy.io/api/tagger#set_annotations
         """
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         labels = self.labels
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index ae227017a9f..6cb33109891 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
 
@@ -14,6 +18,9 @@
 from ..vocab import Vocab
 from .trainable_pipe import TrainablePipe
 
+ActivationsT = Dict[str, Floats2d]
+
+
 single_label_default_config = """
 [model]
 @architectures = "spacy.TextCatEnsemble.v2"
@@ -80,7 +87,8 @@
     default_config={
         "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
+        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -101,6 +109,7 @@ def make_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
+    save_activations: bool,
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -110,8 +119,16 @@ def make_textcat(
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
     scorer (Optional[Callable]): The scoring method.
+    save_activations (bool): save model activations in Doc when annotating.
     """
-    return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
+    return TextCategorizer(
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
+    )
 
 
 def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
@@ -142,6 +159,7 @@ def __init__(
         *,
         threshold: float,
         scorer: Optional[Callable] = textcat_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for single-label classification.
 
@@ -167,6 +185,7 @@ def __init__(
         }
         self.cfg = dict(cfg)
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
@@ -191,7 +210,7 @@ def label_data(self) -> List[str]:
         """
         return self.labels  # type: ignore[return-value]
 
-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
         docs (Iterable[Doc]): The documents to predict.
@@ -204,12 +223,12 @@ def predict(self, docs: Iterable[Doc]):
             tensors = [doc.tensor for doc in docs]
             xp = self.model.ops.xp
             scores = xp.zeros((len(list(docs)), len(self.labels)))
-            return scores
+            return {"probabilities": scores}
         scores = self.model.predict(docs)
         scores = self.model.ops.asarray(scores)
-        return scores
+        return {"probabilities": scores}
 
-    def set_annotations(self, docs: Iterable[Doc], scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
@@ -217,9 +236,13 @@ def set_annotations(self, docs: Iterable[Doc], scores) -> None:
 
         DOCS: https://spacy.io/api/textcategorizer#set_annotations
         """
+        probs = activations["probabilities"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["probabilities"] = probs[i]
             for j, label in enumerate(self.labels):
-                doc.cats[label] = float(scores[i, j])
+                doc.cats[label] = float(probs[i, j])
 
     def update(
         self,
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 2f8d5e60437..ac024ba3639 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional
 
@@ -78,7 +82,8 @@
     default_config={
         "threshold": 0.5,
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -99,8 +104,9 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "MultiLabel_TextCategorizer":
-    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
+    save_activations: bool,
+) -> "TextCategorizer":
+    """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -111,7 +117,12 @@ def make_multilabel_textcat(
     scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
-        nlp.vocab, model, name, threshold=threshold, scorer=scorer
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -143,6 +154,7 @@ def __init__(
         *,
         threshold: float,
         scorer: Optional[Callable] = textcat_multilabel_score,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for multi-label classification.
 
@@ -151,7 +163,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
-        scorer (Optional[Callable]): The scoring method.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init
         """
@@ -162,6 +174,7 @@ def __init__(
         cfg = {"labels": [], "threshold": threshold}
         self.cfg = dict(cfg)
         self.scorer = scorer
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index b1d2550a1ce..3e9a0a9584d 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe):
     cdef public object model
     cdef public object cfg
     cdef public object scorer
+    cdef bint _save_activations
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 8f219b32797..bd360c9501b 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -2,10 +2,14 @@
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import Model, Optimizer, set_dropout_rate
+from thinc.api import set_dropout_rate, Model, Optimizer
+import warnings
 
 from ..tokens.doc cimport Doc
 
+from ..training import validate_examples
+from ..errors import Errors, Warnings
+from .pipe import Pipe, deserialize_config
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -342,3 +346,11 @@ cdef class TrainablePipe(Pipe):
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self
+
+    @property
+    def save_activations(self):
+        return self._save_activations
+
+    @save_activations.setter
+    def save_activations(self, save_activations: bool):
+        self._save_activations = save_activations
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 5a8f0aee2ab..ba2ed4e5ff3 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pickle
 
 import hypothesis.strategies as st
@@ -8,6 +9,8 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
+from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
@@ -331,3 +334,26 @@ def test_empty_strings():
     no_change = trees.add("xyz", "xyz")
     empty = trees.add("", "")
     assert no_change == empty
+
+
+def test_save_activations():
+    nlp = English()
+    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+    lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = lemmatizer.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "trainable_lemmatizer" not in doc.activations
+
+    lemmatizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+        "probabilities",
+        "tree_ids",
+    ]
+    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 5e50a4d2801..32e7a265f37 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,7 +1,8 @@
-from typing import Any, Callable, Dict, Iterable, Tuple
+from typing import Callable, Iterable, Dict, Any, cast
 
 import pytest
 from numpy.testing import assert_equal
+from thinc.types import Ragged
 
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
@@ -9,8 +10,7 @@
 from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
 from spacy.lang.en import English
 from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker
-from spacy.pipeline import EntityLinker
+from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
@@ -1297,16 +1297,64 @@ def create_kb(vocab):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
 
 
-def test_span_maker_forward_with_empty():
-    """The forward pass of the span maker may have a doc with no entities."""
+def test_save_activations():
     nlp = English()
-    doc1 = nlp("a b c")
-    ent = doc1[0:1]
-    ent.label_ = "X"
-    doc1.ents = [ent]
-    # no entities
-    doc2 = nlp("x y z")
-
-    # just to get a model
-    span_maker = build_span_maker()
-    span_maker([doc1, doc2], False)
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True))
+    assert isinstance(entity_linker, EntityLinker)
+    entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+
+    # initialize the NEL pipe
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    nO = entity_linker.model.get_dim("nO")
+
+    nlp.add_pipe("sentencizer", first=True)
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    doc = nlp("Russ Cochran was a publisher")
+    assert "entity_linker" not in doc.activations
+
+    entity_linker.save_activations = True
+    doc = nlp("Russ Cochran was a publisher")
+    assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"}
+    ents = doc.activations["entity_linker"]["ents"]
+    assert isinstance(ents, Ragged)
+    assert ents.data.shape == (2, 1)
+    assert ents.data.dtype == "uint64"
+    assert ents.lengths.shape == (1,)
+    scores = doc.activations["entity_linker"]["scores"]
+    assert isinstance(scores, Ragged)
+    assert scores.data.shape == (2, 1)
+    assert scores.data.dtype == "float32"
+    assert scores.lengths.shape == (1,)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 0d895f23688..c2b65977ac3 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -7,7 +8,8 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.morphology import Morphology
-from spacy.tests.util import make_tempdir
+from spacy.pipeline import TrainablePipe
+from spacy.attrs import MORPH
 from spacy.tokens import Doc
 from spacy.training import Example
 
@@ -224,3 +226,25 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_save_activations():
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "morphologizer" not in doc.activations
+
+    morphologizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {
+        "label_ids",
+        "probabilities",
+    }
+    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 6c76558123f..2e40d86ff48 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal
 
@@ -5,6 +6,7 @@
 from spacy.attrs import SENT_START
 from spacy.lang.en import English
 from spacy.language import Language
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir
 from spacy.training import Example
 
@@ -101,3 +103,26 @@ def test_overfitting_IO():
     # test internal pipe labels vs. Language.pipe_labels with hidden labels
     assert nlp.get_pipe("senter").labels == ("I", "S")
     assert "senter" not in nlp.pipe_labels
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = senter.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "senter" not in doc.activations
+
+    senter.save_activations = True
+    doc = nlp("This is a test.")
+    assert "senter" in doc.activations
+    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+    assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index c143d193fa6..9678e9b63b8 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -594,19 +594,21 @@ def test_set_candidates(name):
     assert docs[0].spans["candidates"][4].text == "Just a"
 
 
-@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
-@pytest.mark.parametrize("n_process", [1, 2])
-def test_spancat_multiprocessing(name, n_process):
-    if isinstance(get_current_ops, NumpyOps) or n_process < 2:
-        nlp = Language()
-        spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
-        train_examples = make_examples(nlp)
-        nlp.initialize(get_examples=lambda: train_examples)
-        texts = [
-            "Just a sentence.",
-            "I like London and Berlin",
-            "I like Berlin",
-            "I eat ham.",
-        ]
-        docs = list(nlp.pipe(texts, n_process=n_process))
-        assert len(docs) == len(texts)
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert "spancat" not in doc.activations
+
+    spancat.save_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 4b5f1ee99fc..5deb323dd71 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -6,7 +7,8 @@
 from spacy.attrs import TAG
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.training import Example
+from spacy.pipeline import TrainablePipe
+from thinc.api import compounding
 
 from ..util import make_tempdir
 
@@ -235,6 +237,26 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    tagger = cast(TrainablePipe, nlp.add_pipe("tagger"))
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "tagger" not in doc.activations
+
+    tagger.save_activations = True
+    doc = nlp("This is a test.")
+    assert "tagger" in doc.activations
+    assert set(doc.activations["tagger"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["tagger"]["probabilities"].shape == (5, len(TAGS))
+    assert doc.activations["tagger"]["label_ids"].shape == (5,)
+
+
 def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 8a0c1a9760d..710dac0571d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,3 +1,4 @@
+from typing import cast
 import random
 
 import numpy.random
@@ -11,17 +12,13 @@
 from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import TextCategorizer
-from spacy.pipeline.textcat import (
-    single_label_bow_config,
-    single_label_cnn_config,
-    single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
-    multi_label_bow_config,
-    multi_label_cnn_config,
-    multi_label_default_config,
-)
+from spacy.pipeline import TextCategorizer, TrainablePipe
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
@@ -298,7 +295,7 @@ def test_issue9904():
     nlp.initialize(get_examples)
 
     examples = get_examples()
-    scores = textcat.predict([eg.predicted for eg in examples])
+    scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
 
     loss = textcat.get_loss(examples, scores)[0]
     loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -949,24 +946,39 @@ def test_textcat_multi_threshold():
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 
 
-@pytest.mark.parametrize(
-    "component_name,scorer",
-    [
-        ("textcat", "spacy.textcat_scorer.v1"),
-        ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
-    ],
-)
-def test_textcat_legacy_scorers(component_name, scorer):
-    """Check that legacy scorers are registered and produce the expected score
-    keys."""
+def test_save_activations():
     nlp = English()
-    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
 
-    # score the model (it's not actually trained but that doesn't matter)
-    scores = nlp.evaluate(train_examples)
-    assert 0 <= scores["cats_score"] <= 1
+    doc = nlp("This is a test.")
+    assert "textcat" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat"].keys()) == ["probabilities"]
+    assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+
+
+def test_save_activations_multi():
+    nlp = English()
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_MULTI_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "textcat_multilabel" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
+    assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index d9719609cdc..5e8975ed337 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
 
     cdef public float sentiment
 
+    cdef public dict activations
+
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 0fae118b4b6..5fda6f2f789 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -16,7 +16,7 @@ from typing import (
 
 import numpy as np
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
 from .span import Span
 from .token import Token
 from .span_groups import SpanGroups
@@ -41,6 +41,7 @@ class Doc:
     max_length: int
     length: int
     sentiment: float
+    activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
     user_token_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 18cb08c7552..ed9e4cd999d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -281,6 +281,7 @@ cdef class Doc:
         self.length = 0
         self.sentiment = 0.0
         self.cats = {}
+        self.activations = {}
         self.user_hooks = {}
         self.user_token_hooks = {}
         self.user_span_hooks = {}
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 0a582650076..310ce0dc88d 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {id="attributes"}
 
-| Name                 | Description                                                                                                                         |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `text`               | A string representation of the document text. ~~str~~                                                                               |
-| `text_with_ws`       | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                       |
-| `mem`                | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                            |
-| `vocab`              | The store of lexical types. ~~Vocab~~                                                                                               |
-| `tensor`             | Container for dense vector representations. ~~numpy.ndarray~~                                                                       |
-| `user_data`          | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                    |
-| `lang`               | Language of the document's vocabulary. ~~int~~                                                                                      |
-| `lang_`              | Language of the document's vocabulary. ~~str~~                                                                                      |
-| `sentiment`          | The document's positivity/negativity score, if available. ~~float~~                                                                 |
-| `user_hooks`         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                           |
-| `user_token_hooks`   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                   |
-| `user_span_hooks`    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                    |
-| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
-| `_`                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~       |
+| Name                                       | Description                                                                                                                                    |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`                                     | A string representation of the document text. ~~str~~                                                                                          |
+| `text_with_ws`                             | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                  |
+| `mem`                                      | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                       |
+| `vocab`                                    | The store of lexical types. ~~Vocab~~                                                                                                          |
+| `tensor` <Tag variant="new">2</Tag>        | Container for dense vector representations. ~~numpy.ndarray~~                                                                                  |
+| `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
+| `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
+| `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
+| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
+| `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
+| `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
+| `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
+| `has_unknown_spaces`                       | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~            |
+| `_`                                        | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                  |
+| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
 
 ## Serialization fields {id="serialization-fields"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 82967482c90..17af19e8c38 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -44,14 +44,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
 > ```
 
-| Setting         | Description                                                                                                                                                                                                                                                                                                        |
-| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `backoff`       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                      |
-| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                         |
-| `overwrite`     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                          |
-| `top_k`         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                              |
-| `scorer`        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                      |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                                        |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                         | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `backoff`                                       | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~                                                                                                                                                                                                                      |
+| `min_tree_freq`                                 | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~                                                                                                                                                                                                                         |
+| `overwrite`                                     | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                          |
+| `top_k`                                         | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~                                                                                                                                                                                                              |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~                                                                                                                                                                      |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"tree_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                    |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index c7b11985aea..85b872151fd 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
-| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
-| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
-| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
-| `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
-| `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
-| `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
-| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                                            |
-| `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
-| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` <Tag variant="new">3.5.1</Tag>  | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
-| `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
-| `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
-| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                      |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                 |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              |
+| `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           |
+| `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        |
+| `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      |
+| `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
+| `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
+| `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
+| `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index ce16f534219..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -42,13 +42,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("morphologizer", config=config)
 > ```
 
-| Setting                                        | Description                                                                                                                                                                                                                                                            |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                        | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>       | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
-| `extend` <Tag variant="new">3.2</Tag>          | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
-| `scorer` <Tag variant="new">3.2</Tag>          | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
-| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                               |
+| Setting                                         | Description                                                                                                                                                                                                                                                            |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
@@ -400,8 +400,8 @@ coarse-grained POS as the feature `POS`.
 > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
 > ```
 
-| Name        | Description                                            |
-| ----------- | ------------------------------------------------------ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
 | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 5435399f956..d5d096d7659 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -39,11 +39,12 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("senter", config=config)
 > ```
 
-| Setting                                  | Description                                                                                                                                                           |
-| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                             |
-| `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~                                   |
+| Setting                                         | Description                                                                                                                                                           |
+| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                             |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~                                   |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                      |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/senter.pyx
diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 98a1948eeab..258db794786 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -62,32 +62,15 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("spancat", config=config)
 > ```
 
-> #### Example (spancat_singlelabel)
->
-> ```python
-> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
-> config = {
->     "spans_key": "labeled_spans",
->     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
->     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
->     # Additional spancat_singlelabel parameters
->     "negative_weight": 0.8,
->     "allow_overlap": True,
-> }
-> nlp.add_pipe("spancat_singlelabel", config=config)
-> ```
-
-| Setting                                             | Description                                                                                                                                                                                                                                                                                             |
-| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `suggester`                                         | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                  |
-| `model`                                             | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
-| `spans_key`                                         | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                  |
-| `threshold`                                         | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~                                                |
-| `max_positive`                                      | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~                                                                                 |
-| `scorer`                                            | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                       |
-| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span` . This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~            |
-| `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               |
-| `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                             |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `suggester`                                     | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~                                                  |
+| `model`                                         | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
+| `spans_key`                                     | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~                                                                                  |
+| `threshold`                                     | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~                                                                                                                                                          |
+| `max_positive`                                  | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~                                                                                                                                                                                      |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~                                                                                                                                       |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                                 |
 
 <Infobox variant="warning">
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index d9b0506fb17..20852e8eb94 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -40,13 +40,13 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("tagger", config=config)
 > ```
 
-| Setting                                        | Description                                                                                                                                                                                                                                                                                            |
-| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                        | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag>       | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                              |
-| `scorer` <Tag variant="new">3.2</Tag>          | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                            |
-| `neg_prefix` <Tag variant="new">3.2.1</Tag>    | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                              |
-| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~                                                                                                                                                                                                               |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                            |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                         | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                              |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~                                                                                                                                                            |
+| `neg_prefix` <Tag variant="new">3.2.1</Tag>     | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~                                                                                                                                                              |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                                                       |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/tagger.pyx
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a259b7b3c65..a1dfb6dd88e 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -116,14 +116,15 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name           | Description                                                                                                                      |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                                 |
-| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
-| _keyword-only_ |                                                                                                                                  |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~        |
-| `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| Name                                            | Description                                                                                                                      |
+| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                         | The shared vocabulary. ~~Vocab~~                                                                                                 |
+| `model`                                         | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
+| `name`                                          | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
+| _keyword-only_                                  |                                                                                                                                  |
+| `threshold`                                     | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                   |
+| `scorer`                                        | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. The supported activations is `"probabilities"`. ~~Union[bool, list[str]]~~            |
 
 ## TextCategorizer.\_\_call\_\_ {id="call",tag="method"}
 

From ae8725980577ff54937ceb0f4650ca486e475163 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 15 Sep 2022 17:06:58 +0200
Subject: [PATCH 238/504] disable mypy run for Python 3.10 (#11508) (#11512)

---
 .github/azure-steps.yml | 117 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 .github/azure-steps.yml

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
new file mode 100644
index 00000000000..c7722391fec
--- /dev/null
+++ b/.github/azure-steps.yml
@@ -0,0 +1,117 @@
+parameters:
+  python_version: ''
+  architecture: ''
+  prefix: ''
+  gpu: false
+  num_build_jobs: 1
+
+steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: ${{ parameters.python_version }}
+      architecture: ${{ parameters.architecture }}
+
+  - bash: |
+      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
+    displayName: 'Set variables'
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U pip setuptools
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install dependencies"
+
+  - script: |
+      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
+      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
+    displayName: "Compile and build sdist"
+
+  - script: python -m mypy spacy
+    displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')
+
+  - task: DeleteFiles@1
+    inputs:
+      contents: "spacy"
+    displayName: "Delete source directory"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
+      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+    displayName: "Uninstall all packages"
+
+  - bash: |
+      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+    displayName: "Install from sdist"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
+      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
+    displayName: "Install GPU requirements"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+    condition: eq(${{ parameters.gpu }}, false)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
+    displayName: "Run GPU tests"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l ca ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      # will have errors due to sparse data, check for summary in output
+      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+    displayName: 'Test debug data CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+    displayName: 'Test train CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Test website/meta/universe.json'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

From 0c30fb9a66d465cb87fd58902c93b24b65b0e278 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Oct 2022 14:41:15 +0200
Subject: [PATCH 239/504] fix test for EL activations with refactored KB

---
 spacy/tests/pipeline/test_entity_linker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 32e7a265f37..55726e401d3 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1312,7 +1312,7 @@ def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(

From 7513bcb1532a67f80cab7d30c9ee1d3f77d22fb7 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:51:06 +0200
Subject: [PATCH 240/504] `StringStore` refactoring (#11344)

* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb6b08cfdcfbf26df6896f26709fac0d.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
---
 spacy/errors.py                               |   4 +-
 spacy/matcher/matcher.pyx                     |   3 +
 spacy/strings.pxd                             |  22 +-
 spacy/strings.pyi                             |  22 +-
 spacy/strings.pyx                             | 410 +++++++++---------
 spacy/tests/vocab_vectors/test_stringstore.py |  41 +-
 spacy/tokens/graph.pyx                        |   4 +-
 spacy/tokens/retokenizer.pyx                  |   4 +-
 website/docs/api/stringstore.mdx              |  82 +++-
 9 files changed, 334 insertions(+), 258 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 146c60b6d60..9814679eb7d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -258,7 +258,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
-    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
+    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
     E019 = ("Can't create transition with unknown action ID: {action}. Action "
@@ -991,6 +991,8 @@ class Errors(metaclass=ErrorsWithCodes):
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
+    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
+             "but got '{received_type}'")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 715dd45f07c..7e734ac247e 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -20,6 +20,9 @@ from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
 
+from ..schemas import validate_token_pattern
+from ..errors import Errors, MatchPatternError, Warnings
+from ..strings cimport get_string_id
 from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index d22f48ba133..b734a707c54 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,3 +1,6 @@
+from libc.stdint cimport int64_t, uint32_t
+from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
 from libcpp.set cimport set
@@ -7,13 +10,6 @@ from preshed.maps cimport PreshMap
 
 from .typedefs cimport attr_t, hash_t
 
-
-cpdef hash_t hash_string(str string) except 0
-cdef hash_t hash_utf8(char* utf8_string, int length) nogil
-
-cdef str decode_Utf8Str(const Utf8Str* string)
-
-
 ctypedef union Utf8Str:
     unsigned char[8] s
     unsigned char* p
@@ -21,9 +17,13 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
+    cdef vector[hash_t] _keys
+    cdef PreshMap _map
+
+    cdef hash_t _intern_str(self, str string)
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *
+    cdef str _decode_str_repr(self, const Utf8Str* string)
 
-    cdef vector[hash_t] keys
-    cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+cpdef hash_t hash_string(object string) except -1
+cpdef hash_t get_string_id(object string_or_hash) except -1
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index f8fe8381c87..8b7c0d6bd5a 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,21 +1,21 @@
+from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
-def get_string_id(key: Union[str, int]) -> int: ...
-
 class StringStore:
-    def __init__(
-        self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
-    ) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
     @overload
-    def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ...
+    def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
-    def __getitem__(self, string_or_id: int) -> str: ...
-    def as_int(self, key: Union[bytes, str, int]) -> int: ...
-    def as_string(self, key: Union[bytes, str, int]) -> str: ...
+    def __getitem__(self, string_or_hash: int) -> str: ...
+    def as_int(self, string_or_hash: Union[str, int]) -> int: ...
+    def as_string(self, string_or_hash: Union[str, int]) -> str: ...
     def add(self, string: str) -> int: ...
+    def items(self) -> List[Tuple[str, int]]: ...
+    def keys(self) -> List[str]: ...
+    def values(self) -> List[int]: ...
     def __len__(self) -> int: ...
-    def __contains__(self, string: str) -> bool: ...
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
     def __iter__(self) -> Iterator[str]: ...
     def __reduce__(self) -> Any: ...
     def to_disk(self, path: Union[str, Path]) -> None: ...
@@ -23,3 +23,5 @@ class StringStore:
     def to_bytes(self, **kwargs: Any) -> bytes: ...
     def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
     def _reset_and_load(self, strings: Iterable[str]) -> None: ...
+
+def get_string_id(string_or_hash: Union[str, int]) -> int: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index e73b66dff54..73e4c46ed46 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,9 +1,8 @@
 # cython: infer_types=True
-# cython: profile=False
+from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
 cimport cython
 from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-from murmurhash.mrmr cimport hash32, hash64
+from murmurhash.mrmr cimport hash64
 
 import srsly
 
@@ -15,105 +14,13 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
-cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
-    try:
-        out_hash[0] = key
-        return True
-    except:  # no-cython-lint
-        return False
-
-
-def get_string_id(key):
-    """Get a string ID, handling the reserved symbols correctly. If the key is
-    already an ID, return it.
-
-    This function optimises for convenience over performance, so shouldn't be
-    used in tight loops.
-    """
-    cdef hash_t str_hash    
-    if isinstance(key, str):
-        if len(key) == 0:
-            return 0
-
-        symbol = SYMBOLS_BY_STR.get(key, None)
-        if symbol is not None:
-            return symbol
-        else:
-            chars = key.encode("utf8")
-            return hash_utf8(chars, len(chars))
-    elif _try_coerce_to_hash(key, &str_hash):
-        # Coerce the integral key to the expected primitive hash type.
-        # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used 
-        # downsteam (as these are internally implemented as custom PyObjects 
-        # whose comparison operators can incur a significant overhead).
-        return str_hash
-    else:
-        # TODO: Raise an error instead
-        return key
-
-
-cpdef hash_t hash_string(str string) except 0:
-    chars = string.encode("utf8")
-    return hash_utf8(chars, len(chars))
-
-
-cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
-    return hash64(utf8_string, length, 1)
-
-
-cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
-    return hash32(utf8_string, length, 1)
-
-
-cdef str decode_Utf8Str(const Utf8Str* string):
-    cdef int i, length
-    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
-        return string.s[1:string.s[0]+1].decode("utf8")
-    elif string.p[0] < 255:
-        return string.p[1:string.p[0]+1].decode("utf8")
-    else:
-        i = 0
-        length = 0
-        while string.p[i] == 255:
-            i += 1
-            length += 255
-        length += string.p[i]
-        i += 1
-        return string.p[i:length + i].decode("utf8")
-
-
-cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
-    cdef int n_length_bytes
-    cdef int i
-    cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
-    if length < sizeof(string.s):
-        string.s[0] = <unsigned char>length
-        memcpy(&string.s[1], chars, length)
-        return string
-    elif length < 255:
-        string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
-        string.p[0] = length
-        memcpy(&string.p[1], chars, length)
-        return string
-    else:
-        i = 0
-        n_length_bytes = (length // 255) + 1
-        string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
-        for i in range(n_length_bytes-1):
-            string.p[i] = 255
-        string.p[n_length_bytes-1] = length % 255
-        memcpy(&string.p[n_length_bytes], chars, length)
-        return string
-
 
 cdef class StringStore:
-    """Look up strings by 64-bit hashes.
+    """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
     DOCS: https://spacy.io/api/stringstore
     """
-    def __init__(self, strings=None, freeze=False):
+    def __init__(self, strings: Optional[Iterable[str]] = None):
         """Create the StringStore.
 
         strings (iterable): A sequence of unicode strings to add to the store.
@@ -124,127 +31,126 @@ cdef class StringStore:
             for string in strings:
                 self.add(string)
 
-    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given hash, or vice versa.
+    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
+        """Retrieve a string from a given hash. If a string
+        is passed as the input, add it to the store and return
+        its hash.
 
-        string_or_id (bytes, str or uint64): The value to encode.
-        Returns (str / uint64): The value to be retrieved.
+        string_or_hash (int / str): The hash value to lookup or the string to store.
+        RETURNS (str / int): The stored string or the hash of the newly added string.
         """
-        cdef hash_t str_hash
-        cdef Utf8Str* utf8str = NULL
-
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return 0
-
-            # Return early if the string is found in the symbols LUT.
-            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
-            if symbol is not None:
-                return symbol
-            else:
-                return hash_string(string_or_id)
-        elif isinstance(string_or_id, bytes):
-            return hash_utf8(string_or_id, len(string_or_id))
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            if str_hash == 0:
-                return ""
-            elif str_hash in SYMBOLS_BY_INT:
-                return SYMBOLS_BY_INT[str_hash]
-            else:
-                utf8str = <Utf8Str*>self._map.get(str_hash)
+        if isinstance(string_or_hash, str):
+            return self.add(string_or_hash)
         else:
-            # TODO: Raise an error instead
-            utf8str = <Utf8Str*>self._map.get(string_or_id)
+            return self._get_interned_str(string_or_hash)
 
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool:
+        """Check whether a string or a hash is in the store.
 
-    def as_int(self, key):
-        """If key is an int, return it; otherwise, get the int value."""
-        if not isinstance(key, str):
-            return key
+        string (str / int): The string/hash to check.
+        RETURNS (bool): Whether the store contains the string.
+        """
+        cdef hash_t str_hash = get_string_id(string_or_hash)
+        if str_hash in SYMBOLS_BY_INT:
+            return True
         else:
-            return self[key]
+            return self._map.get(str_hash) is not NULL
 
-    def as_string(self, key):
-        """If key is a string, return it; otherwise, get the string value."""
-        if isinstance(key, str):
-            return key
-        else:
-            return self[key]
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the strings in the store in insertion order.
+
+        RETURNS: An iterable collection of strings.
+        """
+        return iter(self.keys())
+
+    def __reduce__(self):
+        strings = list(self)
+        return (StringStore, (strings,), None, None, None)
+
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self._keys.size()
 
-    def add(self, string):
+    def add(self, string: str) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
         RETURNS (uint64): The string's hash value.
         """
-        cdef hash_t str_hash
-        if isinstance(string, str):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-
-            string = string.encode("utf8")
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        elif isinstance(string, bytes):
-            if string in SYMBOLS_BY_STR:
-                return SYMBOLS_BY_STR[string]
-            str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
-        else:
+        if not isinstance(string, str):
             raise TypeError(Errors.E017.format(value_type=type(string)))
-        return str_hash
 
-    def __len__(self):
-        """The number of strings in the store.
+        if string in SYMBOLS_BY_STR:
+            return SYMBOLS_BY_STR[string]
+        else:
+            return self._intern_str(string)
 
-        RETURNS (int): The number of strings in the store.
+    def as_int(self, string_or_hash: Union[str, int]) -> str:
+        """If a hash value is passed as the input, return it as-is. If the input
+        is a string, return its corresponding hash.
+
+        string_or_hash (str / int): The string to hash or a hash value.
+        RETURNS (int): The hash of the string or the input hash value.
         """
-        return self.keys.size()
+        if isinstance(string_or_hash, int):
+            return string_or_hash
+        else:
+            return get_string_id(string_or_hash)
 
-    def __contains__(self, string_or_id not None):
-        """Check whether a string or ID is in the store.
+    def as_string(self, string_or_hash: Union[str, int]) -> str:
+        """If a string is passed as the input, return it as-is. If the input
+        is a hash value, return its corresponding string.
 
-        string_or_id (str or int): The string to check.
-        RETURNS (bool): Whether the store contains the string.
+        string_or_hash (str / int): The hash value to lookup or a string.
+        RETURNS (str): The stored string or the input string.
         """
-        cdef hash_t str_hash
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return True
-            elif string_or_id in SYMBOLS_BY_STR:
-                return True
-            str_hash = hash_string(string_or_id)
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            pass
+        if isinstance(string_or_hash, str):
+            return string_or_hash
         else:
-            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            return self._get_interned_str(string_or_hash)
 
-        if str_hash in SYMBOLS_BY_INT:
-            return True
-        else:
-            return self._map.get(str_hash) is not NULL
+    def items(self) -> List[Tuple[str, int]]:
+        """Iterate over the stored strings and their hashes in insertion order.
 
-    def __iter__(self):
-        """Iterate over the strings in the store, in order.
+        RETURNS: A list of string-hash pairs.
+        """
+        # Even though we internally store the hashes as keys and the strings as
+        # values, we invert the order in the public API to keep it consistent with
+        # the implementation of the `__iter__` method (where we wish to iterate over
+        # the strings in the store).
+        cdef int i
+        pairs = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            str_hash = self._keys[i]
+            utf8str = <Utf8Str*>self._map.get(str_hash)
+            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
+        return pairs
+
+    def keys(self) -> List[str]:
+        """Iterate over the stored strings in insertion order.
 
-        YIELDS (str): A string in the store.
+        RETURNS: A list of strings.
         """
         cdef int i
-        cdef hash_t key
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            utf8str = <Utf8Str*>self._map.get(key)
-            yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        strings = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            utf8str = <Utf8Str*>self._map.get(self._keys[i])
+            strings[i] = self._decode_str_repr(utf8str)
+        return strings
 
-    def __reduce__(self):
-        strings = list(self)
-        return (StringStore, (strings,), None, None, None)
+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashs.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        return hashes
 
     def to_disk(self, path):
         """Save the current state to a directory.
@@ -295,24 +201,122 @@ cdef class StringStore:
     def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self.keys.clear()
+        self._keys.clear()
         for string in strings:
             self.add(string)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+    def _get_interned_str(self, hash_value: int) -> str:
+        cdef hash_t str_hash
+        if not _try_coerce_to_hash(hash_value, &str_hash):
+            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))
+
+        # Handle reserved symbols and empty strings correctly.
+        if str_hash == 0:
+            return ""
 
-    @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+        symbol = SYMBOLS_BY_INT.get(str_hash)
+        if symbol is not None:
+            return symbol
+
+        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL:
+            raise KeyError(Errors.E018.format(hash_value=str_hash))
+        else:
+            return self._decode_str_repr(utf8str)
+
+    cdef hash_t _intern_str(self, str string):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
+        chars = string.encode('utf-8')
+        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
-            return value
-        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
+            return key
+
+        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
         self._map.set(key, value)
-        self.keys.push_back(key)
-        return value
+        self._keys.push_back(key)
+        return key
+
+    cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *:
+        cdef int n_length_bytes
+        cdef int i
+        cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
+        cdef uint32_t ulength = length
+        if length < sizeof(string.s):
+            string.s[0] = <unsigned char>length
+            memcpy(&string.s[1], chars, length)
+            return string
+        elif length < 255:
+            string.p = <unsigned char*>self.mem.alloc(length + 1, sizeof(unsigned char))
+            string.p[0] = length
+            memcpy(&string.p[1], chars, length)
+            return string
+        else:
+            i = 0
+            n_length_bytes = (length // 255) + 1
+            string.p = <unsigned char*>self.mem.alloc(length + n_length_bytes, sizeof(unsigned char))
+            for i in range(n_length_bytes-1):
+                string.p[i] = 255
+            string.p[n_length_bytes-1] = length % 255
+            memcpy(&string.p[n_length_bytes], chars, length)
+            return string
+
+    cdef str _decode_str_repr(self, const Utf8Str* string):
+        cdef int i, length
+        if string.s[0] < sizeof(string.s) and string.s[0] != 0:
+            return string.s[1:string.s[0]+1].decode('utf-8')
+        elif string.p[0] < 255:
+            return string.p[1:string.p[0]+1].decode('utf-8')
+        else:
+            i = 0
+            length = 0
+            while string.p[i] == 255:
+                i += 1
+                length += 255
+            length += string.p[i]
+            i += 1
+            return string.p[i:length + i].decode('utf-8')
+
+
+cpdef hash_t hash_string(object string) except -1:
+    if not isinstance(string, str):
+        raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string)))
+
+    # Handle reserved symbols and empty strings correctly.
+    if len(string) == 0:
+        return 0
+
+    symbol = SYMBOLS_BY_STR.get(string)
+    if symbol is not None:
+        return symbol
+
+    chars = string.encode('utf-8')
+    return hash64(<unsigned char*>chars, len(chars), 1)
+
+
+cpdef hash_t get_string_id(object string_or_hash) except -1:
+    cdef hash_t str_hash
+
+    try:
+        return hash_string(string_or_hash)
+    except:
+        if _try_coerce_to_hash(string_or_hash, &str_hash):
+            # Coerce the integral key to the expected primitive hash type.
+            # This ensures that custom/overloaded "primitive" data types
+            # such as those implemented by numpy are not inadvertently used
+            # downsteam (as these are internally implemented as custom PyObjects
+            # whose comparison operators can incur a significant overhead).
+            return str_hash
+        else:
+            raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash)))
+
+
+# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)`
+cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
+    try:
+        out_hash[0] = key
+        return True
+    except:
+        return False
+
diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py
index 61039fffd4c..68c307939d3 100644
--- a/spacy/tests/vocab_vectors/test_stringstore.py
+++ b/spacy/tests/vocab_vectors/test_stringstore.py
@@ -25,6 +25,14 @@ def test_stringstore_from_api_docs(stringstore):
     stringstore.add("orange")
     all_strings = [s for s in stringstore]
     assert all_strings == ["apple", "orange"]
+    assert all_strings == list(stringstore.keys())
+    all_strings_and_hashes = list(stringstore.items())
+    assert all_strings_and_hashes == [
+        ("apple", 8566208034543834098),
+        ("orange", 2208928596161743350),
+    ]
+    all_hashes = list(stringstore.values())
+    assert all_hashes == [8566208034543834098, 2208928596161743350]
     banana_hash = stringstore.add("banana")
     assert len(stringstore) == 3
     assert banana_hash == 2525716904149915114
@@ -32,12 +40,25 @@ def test_stringstore_from_api_docs(stringstore):
     assert stringstore["banana"] == banana_hash
 
 
-@pytest.mark.parametrize("text1,text2,text3", [(b"Hello", b"goodbye", b"hello")])
-def test_stringstore_save_bytes(stringstore, text1, text2, text3):
-    key = stringstore.add(text1)
-    assert stringstore[text1] == key
-    assert stringstore[text2] != key
-    assert stringstore[text3] != key
+@pytest.mark.parametrize(
+    "val_bytes,val_float,val_list,val_text,val_hash",
+    [(b"Hello", 1.1, ["abc"], "apple", 8566208034543834098)],
+)
+def test_stringstore_type_checking(
+    stringstore, val_bytes, val_float, val_list, val_text, val_hash
+):
+    with pytest.raises(TypeError):
+        assert stringstore[val_bytes]
+
+    with pytest.raises(TypeError):
+        stringstore.add(val_float)
+
+    with pytest.raises(TypeError):
+        assert val_list not in stringstore
+
+    key = stringstore.add(val_text)
+    assert val_hash == key
+    assert stringstore[val_hash] == val_text
 
 
 @pytest.mark.parametrize("text1,text2,text3", [("Hello", "goodbye", "hello")])
@@ -48,19 +69,19 @@ def test_stringstore_save_unicode(stringstore, text1, text2, text3):
     assert stringstore[text3] != key
 
 
-@pytest.mark.parametrize("text", [b"A"])
+@pytest.mark.parametrize("text", ["A"])
 def test_stringstore_retrieve_id(stringstore, text):
     key = stringstore.add(text)
     assert len(stringstore) == 1
-    assert stringstore[key] == text.decode("utf8")
+    assert stringstore[key] == text
     with pytest.raises(KeyError):
         stringstore[20000]
 
 
-@pytest.mark.parametrize("text1,text2", [(b"0123456789", b"A")])
+@pytest.mark.parametrize("text1,text2", [("0123456789", "A")])
 def test_stringstore_med_string(stringstore, text1, text2):
     store = stringstore.add(text1)
-    assert stringstore[store] == text1.decode("utf8")
+    assert stringstore[store] == text1
     stringstore.add(text2)
     assert stringstore[text1] == store
 
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 6c4ce6ce358..22ce18181a7 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -16,9 +16,7 @@ from murmurhash.mrmr cimport hash64
 from .. import Errors
 
 from ..typedefs cimport hash_t
-
-from ..strings import get_string_id
-
+from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
 
 from .token import Token
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index b0e4ff85c9f..d3e9c5674cc 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -15,9 +15,7 @@ from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
-from ..strings import get_string_id
-from ..util import SimpleFrozenDict
-from .underscore import is_writable_attr
+from ..strings cimport get_string_id
 
 
 cdef class Retokenizer:
diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 6a3e9d6644e..d4d85e6d56a 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -47,7 +47,8 @@ Get the number of strings in the store.
 
 ## StringStore.\_\_getitem\_\_ {id="getitem",tag="method"}
 
-Retrieve a string from a given hash, or vice versa.
+Retrieve a string from a given hash. If a string is passed as the input, add it
+to the store and return its hash.
 
 > #### Example
 >
@@ -58,14 +59,14 @@ Retrieve a string from a given hash, or vice versa.
 > assert stringstore[apple_hash] == "apple"
 > ```
 
-| Name           | Description                                     |
-| -------------- | ----------------------------------------------- |
-| `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ |
-| **RETURNS**    | The value to be retrieved. ~~Union[str, int]~~  |
+| Name             | Description                                                                  |
+| ---------------- | ---------------------------------------------------------------------------- |
+| `string_or_hash` | The hash value to lookup or the string to store. ~~Union[str, int]~~         |
+| **RETURNS**      | The stored string or the hash of the newly added string. ~~Union[str, int]~~ |
 
 ## StringStore.\_\_contains\_\_ {id="contains",tag="method"}
 
-Check whether a string is in the store.
+Check whether a string or a hash is in the store.
 
 > #### Example
 >
@@ -75,15 +76,14 @@ Check whether a string is in the store.
 > assert not "cherry" in stringstore
 > ```
 
-| Name        | Description                                     |
-| ----------- | ----------------------------------------------- |
-| `string`    | The string to check. ~~str~~                    |
-| **RETURNS** | Whether the store contains the string. ~~bool~~ |
+| Name             | Description                                             |
+| ---------------- | ------------------------------------------------------- |
+| `string_or_hash` | The string or hash to check. ~~Union[str, int]~~        |
+| **RETURNS**      | Whether the store contains the string or hash. ~~bool~~ |
 
 ## StringStore.\_\_iter\_\_ {id="iter",tag="method"}
 
-Iterate over the strings in the store, in order. Note that a newly initialized
-store will always include an empty string `""` at position `0`.
+Iterate over the stored strings in insertion order.
 
 > #### Example
 >
@@ -93,11 +93,59 @@ store will always include an empty string `""` at position `0`.
 > assert all_strings == ["apple", "orange"]
 > ```
 
-| Name       | Description                    |
-| ---------- | ------------------------------ |
-| **YIELDS** | A string in the store. ~~str~~ |
+| Name        | Description                    |
+| ----------- | ------------------------------ |
+| **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.add {id="add",tag="method",version="2"}
+## StringStore.items {#iter tag="method" new="4"}
+
+Iterate over the stored string-hash pairs in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_strings_and_hashes = stringstore.items()
+> assert all_strings_and_hashes == [("apple", 8566208034543834098), ("orange", 2208928596161743350)]
+> ```
+
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
+| **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
+
+## StringStore.keys {#iter tag="method" new="4"}
+
+Iterate over the stored strings in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_strings = stringstore.keys()
+> assert all_strings == ["apple", "orange"]
+> ```
+
+| Name        | Description                      |
+| ----------- | -------------------------------- |
+| **RETURNS** | A list of strings. ~~List[str]~~ |
+
+## StringStore.values {#iter tag="method" new="4"}
+
+Iterate over the stored string hashes in insertion order.
+
+> #### Example
+>
+> ```python
+> stringstore = StringStore(["apple", "orange"])
+> all_hashes = stringstore.values()
+> assert all_hashes == [8566208034543834098, 2208928596161743350]
+> ```
+
+| Name        | Description                            |
+| ----------- | -------------------------------------- |
+| **RETURNS** | A list of string hashes. ~~List[int]~~ |
+
+## StringStore.add {#add tag="method"}
 
 Add a string to the `StringStore`.
 
@@ -117,7 +165,7 @@ Add a string to the `StringStore`.
 | `string`    | The string to add. ~~str~~       |
 | **RETURNS** | The string's hash value. ~~int~~ |
 
-## StringStore.to_disk {id="to_disk",tag="method",version="2"}
+## StringStore.to_disk {#to_disk tag="method"}
 
 Save the current state to a directory.
 

From ab23880d4e4bc09b3f17d4124a79fe0ea49cdbfa Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 21 Oct 2022 18:01:18 +0900
Subject: [PATCH 241/504] Remove thinc util reimports (#11665)

* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
---
 spacy/tests/pipeline/test_spancat.py | 7 +++----
 spacy/util.py                        | 8 +++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 9678e9b63b8..5dcc2e70f67 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,6 @@
 import numpy
-import pytest
-from numpy.testing import assert_almost_equal, assert_array_equal
-from thinc.api import NumpyOps, Ragged, get_current_ops
+from numpy.testing import assert_array_equal, assert_almost_equal
+from thinc.api import get_current_ops, Ragged, fix_random_seed
 
 from spacy import util
 from spacy.lang.en import English
@@ -9,7 +8,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import fix_random_seed, make_tempdir, registry
+from spacy.util import registry, make_tempdir
 
 OPS = get_current_ops()
 
diff --git a/spacy/util.py b/spacy/util.py
index c127be03c37..8068c4bcec9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -63,12 +63,10 @@
 except ImportError:
     cupy = None
 
-# These are functions that were previously (v2.x) available from spacy.util
-# and have since moved to Thinc. We're importing them here so people's code
-# doesn't break, but they should always be imported from Thinc from now on,
-# not from spacy.util.
-from thinc.api import compounding, decaying, fix_random_seed  # noqa: F401
 
+from .symbols import ORTH
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
 from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings

From ff964e24133c3310123daabb7ad67cf2d7db3f13 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 242/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/errors.py                               |   8 +-
 spacy/pipeline/__init__.py                    |   2 -
 spacy/pipeline/entity_ruler.py                | 541 ------------------
 spacy/pipeline/span_ruler.py                  |  23 +-
 spacy/tests/matcher/test_phrase_matcher.py    |   9 +-
 spacy/tests/pipeline/test_entity_ruler.py     | 259 +++------
 .../serialize/test_serialize_pipeline.py      |  67 +--
 website/docs/api/entityruler.mdx              | 311 ++--------
 website/docs/api/spanruler.mdx                |  13 +-
 website/docs/usage/101/_architecture.mdx      |  40 +-
 website/docs/usage/101/_pipelines.mdx         |   6 +-
 website/docs/usage/processing-pipelines.mdx   |   5 +-
 website/docs/usage/rule-based-matching.mdx    |  43 +-
 website/docs/usage/saving-loading.mdx         |  10 +-
 website/docs/usage/training.mdx               |   2 +-
 15 files changed, 245 insertions(+), 1094 deletions(-)
 delete mode 100644 spacy/pipeline/entity_ruler.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 9814679eb7d..965c92066bc 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -470,13 +470,13 @@ class Errors(metaclass=ErrorsWithCodes):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E153 = ("The value type {vtype} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E154 = ("One of the attributes or values is not supported for token "
             "patterns. Please use the option `validate=True` with the Matcher, "
-            "PhraseMatcher, or EntityRuler for more details.")
+            "PhraseMatcher, or SpanRuler for more details.")
     E155 = ("The pipeline needs to include a {pipe} in order to use "
             "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
@@ -933,8 +933,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
-             "exist.")
     E1024 = ("A pattern with {attr_type} '{label}' is not present in "
              "'{component}' patterns.")
     E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 82d24486a27..e26f7436efa 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,7 +3,6 @@
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
-from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
@@ -25,7 +24,6 @@
     "EditTreeLemmatizer",
     "EntityLinker",
     "EntityRecognizer",
-    "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
     "MultiLabel_TextCategorizer",
diff --git a/spacy/pipeline/entity_ruler.py b/spacy/pipeline/entity_ruler.py
deleted file mode 100644
index 3683cfc0270..00000000000
--- a/spacy/pipeline/entity_ruler.py
+++ /dev/null
@@ -1,541 +0,0 @@
-import warnings
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
-
-import srsly
-
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..matcher import Matcher, PhraseMatcher
-from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import get_ner_prf
-from ..tokens import Doc, Span
-from ..training import Example
-from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
-from .pipe import Pipe
-
-DEFAULT_ENT_ID_SEP = "||"
-PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
-
-
-@Language.factory(
-    "entity_ruler",
-    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
-    default_config={
-        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-        "validate": False,
-        "overwrite_ents": False,
-        "ent_id_sep": DEFAULT_ENT_ID_SEP,
-        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
-    },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
-)
-def make_entity_ruler(
-    nlp: Language,
-    name: str,
-    phrase_matcher_attr: Optional[Union[int, str]],
-    matcher_fuzzy_compare: Callable,
-    validate: bool,
-    overwrite_ents: bool,
-    ent_id_sep: str,
-    scorer: Optional[Callable],
-):
-    return EntityRuler(
-        nlp,
-        name,
-        phrase_matcher_attr=phrase_matcher_attr,
-        matcher_fuzzy_compare=matcher_fuzzy_compare,
-        validate=validate,
-        overwrite_ents=overwrite_ents,
-        ent_id_sep=ent_id_sep,
-        scorer=scorer,
-    )
-
-
-def entity_ruler_score(examples, **kwargs):
-    return get_ner_prf(examples)
-
-
-@registry.scorers("spacy.entity_ruler_scorer.v1")
-def make_entity_ruler_scorer():
-    return entity_ruler_score
-
-
-class EntityRuler(Pipe):
-    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
-    rules or exact phrase matches. It can be combined with the statistical
-    `EntityRecognizer` to boost accuracy, or used on its own to implement a
-    purely rule-based entity recognition system. After initialization, the
-    component is typically added to the pipeline using `nlp.add_pipe`.
-
-    DOCS: https://spacy.io/api/entityruler
-    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
-    """
-
-    def __init__(
-        self,
-        nlp: Language,
-        name: str = "entity_ruler",
-        *,
-        phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = levenshtein_compare,
-        validate: bool = False,
-        overwrite_ents: bool = False,
-        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
-        patterns: Optional[List[PatternType]] = None,
-        scorer: Optional[Callable] = entity_ruler_score,
-    ) -> None:
-        """Initialize the entity ruler. If patterns are supplied here, they
-        need to be a list of dictionaries with a `"label"` and `"pattern"`
-        key. A pattern can either be a token pattern (list) or a phrase pattern
-        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
-
-        nlp (Language): The shared nlp object to pass the vocab to the matchers
-            and process phrase patterns.
-        name (str): Instance name of the current pipeline component. Typically
-            passed in automatically from the factory when the component is
-            added. Used to disable the current entity ruler while creating
-            phrase patterns with the nlp object.
-        phrase_matcher_attr (int / str): Token attribute to match on, passed
-            to the internal PhraseMatcher as `attr`.
-        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
-            internal Matcher. Defaults to
-            spacy.matcher.levenshtein.levenshtein_compare.
-        validate (bool): Whether patterns should be validated, passed to
-            Matcher and PhraseMatcher as `validate`
-        patterns (iterable): Optional patterns to load in.
-        overwrite_ents (bool): If existing entities are present, e.g. entities
-            added by the model, overwrite them by matches if necessary.
-        ent_id_sep (str): Separator used internally for entity IDs.
-        scorer (Optional[Callable]): The scoring method. Defaults to
-            spacy.scorer.get_ner_prf.
-
-        DOCS: https://spacy.io/api/entityruler#init
-        """
-        self.nlp = nlp
-        self.name = name
-        self.overwrite = overwrite_ents
-        self.token_patterns = defaultdict(list)  # type: ignore
-        self.phrase_patterns = defaultdict(list)  # type: ignore
-        self._validate = validate
-        self.matcher_fuzzy_compare = matcher_fuzzy_compare
-        self.matcher = Matcher(
-            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
-        )
-        self.phrase_matcher_attr = phrase_matcher_attr
-        self.phrase_matcher = PhraseMatcher(
-            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-        )
-        self.ent_id_sep = ent_id_sep
-        self._ent_ids = defaultdict(tuple)  # type: ignore
-        if patterns is not None:
-            self.add_patterns(patterns)
-        self.scorer = scorer
-
-    def __len__(self) -> int:
-        """The number of all patterns added to the entity ruler."""
-        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
-        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
-        return n_token_patterns + n_phrase_patterns
-
-    def __contains__(self, label: str) -> bool:
-        """Whether a label is present in the patterns."""
-        return label in self.token_patterns or label in self.phrase_patterns
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Find matches in document and add them as entities.
-
-        doc (Doc): The Doc object in the pipeline.
-        RETURNS (Doc): The Doc with added entities, if available.
-
-        DOCS: https://spacy.io/api/entityruler#call
-        """
-        error_handler = self.get_error_handler()
-        try:
-            matches = self.match(doc)
-            self.set_annotations(doc, matches)
-            return doc
-        except Exception as e:
-            return error_handler(self.name, self, [doc], e)
-
-    def match(self, doc: Doc):
-        self._require_patterns()
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", message="\\[W036")
-            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
-
-        final_matches = set(
-            [(m_id, start, end) for m_id, start, end in matches if start != end]
-        )
-        get_sort_key = lambda m: (m[2] - m[1], -m[1])
-        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
-        return final_matches
-
-    def set_annotations(self, doc, matches):
-        """Modify the document in place"""
-        entities = list(doc.ents)
-        new_entities = []
-        seen_tokens = set()
-        for match_id, start, end in matches:
-            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
-                continue
-            # check for end - 1 here because boundaries are inclusive
-            if start not in seen_tokens and end - 1 not in seen_tokens:
-                if match_id in self._ent_ids:
-                    label, ent_id = self._ent_ids[match_id]
-                    span = Span(doc, start, end, label=label, span_id=ent_id)
-                else:
-                    span = Span(doc, start, end, label=match_id)
-                new_entities.append(span)
-                entities = [
-                    e for e in entities if not (e.start < end and e.end > start)
-                ]
-                seen_tokens.update(range(start, end))
-        doc.ents = entities + new_entities
-
-    @property
-    def labels(self) -> Tuple[str, ...]:
-        """All labels present in the match patterns.
-
-        RETURNS (set): The string labels.
-
-        DOCS: https://spacy.io/api/entityruler#labels
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_labels = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                label, _ = self._split_label(l)
-                all_labels.add(label)
-            else:
-                all_labels.add(l)
-        return tuple(sorted(all_labels))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        patterns: Optional[Sequence[PatternType]] = None,
-    ):
-        """Initialize the pipe for training.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        patterns Optional[Iterable[PatternType]]: The list of patterns.
-
-        DOCS: https://spacy.io/api/entityruler#initialize
-        """
-        self.clear()
-        if patterns:
-            self.add_patterns(patterns)  # type: ignore[arg-type]
-
-    @property
-    def ent_ids(self) -> Tuple[Optional[str], ...]:
-        """All entity ids present in the match patterns `id` properties
-
-        RETURNS (set): The string entity ids.
-
-        DOCS: https://spacy.io/api/entityruler#ent_ids
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_ent_ids = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                _, ent_id = self._split_label(l)
-                all_ent_ids.add(ent_id)
-        return tuple(all_ent_ids)
-
-    @property
-    def patterns(self) -> List[PatternType]:
-        """Get all patterns that were added to the entity ruler.
-
-        RETURNS (list): The original patterns, one dictionary per pattern.
-
-        DOCS: https://spacy.io/api/entityruler#patterns
-        """
-        all_patterns = []
-        for label, patterns in self.token_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        for label, patterns in self.phrase_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern.text}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        return all_patterns
-
-    def add_patterns(self, patterns: List[PatternType]) -> None:
-        """Add patterns to the entity ruler. A pattern can either be a token
-        pattern (list of dicts) or a phrase pattern (string). For example:
-        {'label': 'ORG', 'pattern': 'Apple'}
-        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
-
-        patterns (list): The patterns to add.
-
-        DOCS: https://spacy.io/api/entityruler#add_patterns
-        """
-
-        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
-        try:
-            current_index = -1
-            for i, (name, pipe) in enumerate(self.nlp.pipeline):
-                if self == pipe:
-                    current_index = i
-                    break
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
-        except ValueError:
-            subsequent_pipes = []
-        with self.nlp.select_pipes(disable=subsequent_pipes):
-            token_patterns = []
-            phrase_pattern_labels = []
-            phrase_pattern_texts = []
-            phrase_pattern_ids = []
-            for entry in patterns:
-                if isinstance(entry["pattern"], str):
-                    phrase_pattern_labels.append(entry["label"])
-                    phrase_pattern_texts.append(entry["pattern"])
-                    phrase_pattern_ids.append(entry.get("id"))
-                elif isinstance(entry["pattern"], list):
-                    token_patterns.append(entry)
-            phrase_patterns = []
-            for label, pattern, ent_id in zip(
-                phrase_pattern_labels,
-                self.nlp.pipe(phrase_pattern_texts),
-                phrase_pattern_ids,
-            ):
-                phrase_pattern = {"label": label, "pattern": pattern}
-                if ent_id:
-                    phrase_pattern["id"] = ent_id
-                phrase_patterns.append(phrase_pattern)
-            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
-                label = entry["label"]  # type: ignore
-                if "id" in entry:
-                    ent_label = label
-                    label = self._create_label(label, entry["id"])
-                    key = self.matcher._normalize_key(label)
-                    self._ent_ids[key] = (ent_label, entry["id"])
-                pattern = entry["pattern"]  # type: ignore
-                if isinstance(pattern, Doc):
-                    self.phrase_patterns[label].append(pattern)
-                    self.phrase_matcher.add(label, [pattern])  # type: ignore
-                elif isinstance(pattern, list):
-                    self.token_patterns[label].append(pattern)
-                    self.matcher.add(label, [pattern])
-                else:
-                    raise ValueError(Errors.E097.format(pattern=pattern))
-
-    def clear(self) -> None:
-        """Reset all patterns."""
-        self.token_patterns = defaultdict(list)
-        self.phrase_patterns = defaultdict(list)
-        self._ent_ids = defaultdict(tuple)
-        self.matcher = Matcher(
-            self.nlp.vocab,
-            validate=self._validate,
-            fuzzy_compare=self.matcher_fuzzy_compare,
-        )
-        self.phrase_matcher = PhraseMatcher(
-            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
-        )
-
-    def remove(self, ent_id: str) -> None:
-        """Remove a pattern by its ent_id if a pattern with this ent_id was added before
-
-        ent_id (str): id of the pattern to be removed
-        RETURNS: None
-        DOCS: https://spacy.io/api/entityruler#remove
-        """
-        label_id_pairs = [
-            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
-        ]
-        if not label_id_pairs:
-            raise ValueError(
-                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
-            )
-        created_labels = [
-            self._create_label(label, eid) for (label, eid) in label_id_pairs
-        ]
-        # remove the patterns from self.phrase_patterns
-        self.phrase_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.phrase_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        self.token_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.token_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        for label in created_labels:
-            if label in self.phrase_matcher:
-                self.phrase_matcher.remove(label)
-            else:
-                self.matcher.remove(label)
-
-    def _require_patterns(self) -> None:
-        """Raise a warning if this component has no patterns defined."""
-        if len(self) == 0:
-            warnings.warn(Warnings.W036.format(name=self.name))
-
-    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
-        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
-
-        label (str): The value of label in a pattern entry
-        RETURNS (tuple): ent_label, ent_id
-        """
-        if self.ent_id_sep in label:
-            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
-        else:
-            ent_label = label
-            ent_id = None  # type: ignore
-        return ent_label, ent_id
-
-    def _create_label(self, label: Any, ent_id: Any) -> str:
-        """Join Entity label with ent_id if the pattern has an `id` attribute
-        If ent_id is not a string, the label is returned as is.
-
-        label (str): The label to set for ent.label_
-        ent_id (str): The label
-        RETURNS (str): The ent_label joined with configured `ent_id_sep`
-        """
-        if isinstance(ent_id, str):
-            label = f"{label}{self.ent_id_sep}{ent_id}"
-        return label
-
-    def from_bytes(
-        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a bytestring.
-
-        patterns_bytes (bytes): The bytestring to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_bytes
-        """
-        cfg = srsly.msgpack_loads(patterns_bytes)
-        self.clear()
-        if isinstance(cfg, dict):
-            self.add_patterns(cfg.get("patterns", cfg))
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab,
-                attr=self.phrase_matcher_attr,
-            )
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-        else:
-            self.add_patterns(cfg)
-        return self
-
-    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
-        """Serialize the entity ruler patterns to a bytestring.
-
-        RETURNS (bytes): The serialized patterns.
-
-        DOCS: https://spacy.io/api/entityruler#to_bytes
-        """
-        serial = {
-            "overwrite": self.overwrite,
-            "ent_id_sep": self.ent_id_sep,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "patterns": self.patterns,
-        }
-        return srsly.msgpack_dumps(serial)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a file. Expects a file containing
-        newline-delimited JSON (JSONL) with one entry per line.
-
-        path (str / Path): The JSONL file to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_disk
-        """
-        path = ensure_path(path)
-        self.clear()
-        depr_patterns_path = path.with_suffix(".jsonl")
-        if path.suffix == ".jsonl":  # user provides a jsonl
-            if path.is_file:
-                patterns = srsly.read_jsonl(path)
-                self.add_patterns(patterns)
-            else:
-                raise ValueError(Errors.E1023.format(path=path))
-        elif depr_patterns_path.is_file():
-            patterns = srsly.read_jsonl(depr_patterns_path)
-            self.add_patterns(patterns)
-        elif path.is_dir():  # path is a valid directory
-            cfg = {}
-            deserializers_patterns = {
-                "patterns": lambda p: self.add_patterns(
-                    srsly.read_jsonl(p.with_suffix(".jsonl"))
-                )
-            }
-            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
-            from_disk(path, deserializers_cfg, {})
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab, attr=self.phrase_matcher_attr
-            )
-            from_disk(path, deserializers_patterns, {})
-        else:  # path is not a valid directory or file
-            raise ValueError(Errors.E146.format(path=path))
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Save the entity ruler patterns to a directory. The patterns will be
-        saved as newline-delimited JSON (JSONL).
-
-        path (str / Path): The JSONL file to save.
-
-        DOCS: https://spacy.io/api/entityruler#to_disk
-        """
-        path = ensure_path(path)
-        cfg = {
-            "overwrite": self.overwrite,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "ent_id_sep": self.ent_id_sep,
-        }
-        serializers = {
-            "patterns": lambda p: srsly.write_jsonl(
-                p.with_suffix(".jsonl"), self.patterns
-            ),
-            "cfg": lambda p: srsly.write_json(p, cfg),
-        }
-        if path.suffix == ".jsonl":  # user wants to save only JSONL
-            srsly.write_jsonl(path, self.patterns)
-        else:
-            to_disk(path, serializers, {})
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 2a5e2179a35..4875c5e4bff 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,6 +17,14 @@
 
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
+from ..language import Language
+from ..errors import Errors, Warnings
+from ..util import ensure_path, SimpleFrozenList, registry
+from ..tokens import Doc, Span
+from ..scorer import Scorer, get_ner_prf
+from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
@@ -33,7 +41,7 @@
 
 
 @Language.factory(
-    "future_entity_ruler",
+    "entity_ruler",
     assigns=["doc.ents"],
     default_config={
         "phrase_matcher_attr": None,
@@ -79,6 +87,15 @@ def make_entity_ruler(
     )
 
 
+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score
+
+
 @Language.factory(
     "span_ruler",
     assigns=["doc.spans"],
@@ -136,7 +153,7 @@ def prioritize_new_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by allowing
     spans to overwrite any entities that they overlap with. Intended to
-    replicate the overwrite_ents=True behavior from the EntityRuler.
+    replicate the overwrite_ents=True behavior from the v3 EntityRuler.
 
     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
@@ -167,7 +184,7 @@ def prioritize_existing_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by prioritizing
     existing entities. Intended to replicate the overwrite_ents=False behavior
-    from the EntityRuler.
+    from the v3 EntityRuler.
 
     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 4ad234cba3b..629f402f38e 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -87,14 +87,15 @@ def test_issue4373():
 
 @pytest.mark.issue(4651)
 def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
-    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    """Test that the entity_ruler PhraseMatcher is deserialized correctly using
+    the method from_disk when the entity_ruler argument phrase_matcher_attr is
     specified.
     """
     text = "Spacy is a python library for nlp"
     nlp = English()
     patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
-    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    config = {"phrase_matcher_attr": "LOWER"}
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp(text)
     res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
@@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr():
     with make_tempdir() as d:
         file_path = d / "entityruler"
         ruler.to_disk(file_path)
-        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+        nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path)
     doc_reloaded = nlp_reloaded(text)
     res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
     assert res == res_reloaded
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index d0ab003919e..9f5204006ec 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,6 +2,12 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, merge_entities
+from spacy.pipeline import SpanRuler
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
@@ -10,8 +16,6 @@
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc, Span
 
-ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"]
-
 
 @pytest.fixture
 def nlp():
@@ -38,13 +42,12 @@ def add_ent_component(doc):
 
 
 @pytest.mark.issue(3345)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue3345(entity_ruler_factory):
+def test_issue3345():
     """Test case where preset entity crosses sentence boundary."""
     nlp = English()
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
@@ -63,15 +66,14 @@ def test_issue3345(entity_ruler_factory):
 
 
 @pytest.mark.issue(4849)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue4849(entity_ruler_factory):
+def test_issue4849():
     nlp = English()
     patterns = [
         {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
         {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
     ]
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
+        "entity_ruler",
         name="entity_ruler",
         config={"phrase_matcher_attr": "LOWER"},
     )
@@ -94,11 +96,10 @@ def test_issue4849(entity_ruler_factory):
 
 
 @pytest.mark.issue(5918)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue5918(entity_ruler_factory):
+def test_issue5918():
     # Test edge case when merging entities.
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Digicon Inc"},
         {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
@@ -123,10 +124,9 @@ def test_issue5918(entity_ruler_factory):
 
 
 @pytest.mark.issue(8168)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue8168(entity_ruler_factory):
+def test_issue8168():
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Apple"},
         {
@@ -146,12 +146,9 @@ def test_issue8168(entity_ruler_factory):
 
 
 @pytest.mark.issue(8216)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_fix8216(nlp, patterns):
     """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"validate": True}
-    )
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
     assert pattern_count > 0
@@ -160,16 +157,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
     assert after_count == pattern_count
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_init(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     assert "HELLO" in ruler
     assert "BYE" in ruler
     nlp.remove_pipe("entity_ruler")
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     doc = nlp("hello world bye bye")
     assert len(doc.ents) == 2
@@ -177,23 +173,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
     nlp.remove_pipe("entity_ruler")
-    nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    nlp.add_pipe("entity_ruler")
     assert nlp.pipe_names == ["entity_ruler"]
     with pytest.warns(UserWarning):
         doc = nlp("hello world bye bye")
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
     assert len(ruler.labels) == 4
@@ -205,7 +199,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     nlp.config["initialize"]["components"]["entity_ruler"] = {
         "patterns": {"@misc": "entity_ruler_patterns"}
     }
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     nlp.initialize()
     assert len(ruler.labels) == 4
@@ -214,20 +208,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     ruler.initialize(lambda: [])
     assert len(ruler.labels) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     doc = nlp("hello world")
@@ -239,9 +231,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_existing(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -250,11 +241,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_overwrite(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -264,11 +252,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_complex(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("foo foo bye bye")
@@ -279,11 +264,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents[1]) == 2
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_entity_id(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
     assert len(doc.ents) == 1
@@ -291,26 +273,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
     assert doc.ents[0].ent_id_ == "a1"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_cfg_ent_id_sep(nlp, patterns):
     config = {"overwrite_ents": True, "ent_id_sep": "**"}
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config)
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
-    if isinstance(ruler, EntityRuler):
-        assert "TECH_ORG**a1" in ruler.phrase_patterns
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "TECH_ORG"
     assert doc.ents[0].ent_id_ == "a1"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns)
+def test_entity_ruler_serialize_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
     new_ruler = new_ruler.from_bytes(ruler_bytes)
@@ -322,28 +301,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
     assert sorted(new_ruler.labels) == sorted(ruler.labels)
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_phrase_matcher_attr_bytes(
-    nlp, patterns, entity_ruler_factory
-):
-    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns)
+def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"}
+    )
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
-    assert new_ruler.phrase_matcher_attr is None
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.phrase_matcher_attr == "LOWER"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_validate(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
-    validated_ruler = EntityRuler(nlp, validate=True)
+def test_entity_ruler_validate(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    validated_ruler = nlp.add_pipe(
+        "entity_ruler", name="validated_ruler", config={"validate": True}
+    )
 
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
@@ -360,16 +338,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory):
         validated_ruler.add_patterns([invalid_pattern])
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+def test_entity_ruler_properties(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
-    assert sorted(ruler.ent_ids) == ["a1", "a2"]
+    assert sorted(ruler.ids) == ["a1", "a2"]
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_overlapping_spans(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "FOOBAR", "pattern": "foo bar"},
         {"label": "BARBAZ", "pattern": "bar baz"},
@@ -418,14 +395,13 @@ def make_test_fuzzy_compare_disabled():
 
 
 @pytest.mark.parametrize("n_process", [1, 2])
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
+def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:
         texts = ["I enjoy eating Pizza Hut pizza."]
 
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
 
-        ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+        ruler = nlp.add_pipe("entity_ruler")
         ruler.add_patterns(patterns)
 
         for doc in nlp.pipe(texts, n_process=2):
@@ -433,9 +409,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
                 assert ent.ent_id_ == "1234"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler.jsonl")
@@ -444,9 +419,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
             ruler.from_disk(d / "non_existing.jsonl")  # read from a bad jsonl file
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_dir(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler")
@@ -455,9 +429,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
             ruler.from_disk(d / "non_existing_dir")  # read from a bad directory
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_basic(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -467,24 +440,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
     doc = nlp("Dina went to school")
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
     assert doc.ents[0].label_ == "PERSON"
     assert doc.ents[0].text == "Dina"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
     assert len(ruler.patterns) == 2
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "DinaCorp", "id": "dina"},
@@ -493,25 +458,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory
     ruler.add_patterns(patterns)
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
-        assert "ORG||dina" in ruler.phrase_matcher
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
-        assert "ORG||dina" not in ruler.phrase_matcher
     assert len(doc.ents) == 1
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -526,9 +481,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
             ruler.remove_by_id("nepattern")
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_several_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -542,27 +496,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "Dina"
     assert doc.ents[1].label_ == "ORG"
     assert doc.ents[1].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 2
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "ORG"
     assert doc.ents[0].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 1
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -578,21 +525,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert doc.ents[1].text == "ACME"
     assert doc.ents[2].label_ == "DATE"
     assert doc.ents[2].text == "her birthday"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-        ruler.remove("acme")
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("dina")
-        ruler.remove_by_id("acme")
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("dina")
+    ruler.remove_by_id("acme")
+    ruler.remove_by_id("bday")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_all_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -600,29 +541,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     ]
     ruler.add_patterns(patterns)
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     assert len(ruler.patterns) == 2
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("bday")
     assert len(ruler.patterns) == 0
     with pytest.warns(UserWarning):
         doc = nlp("Dina founded her company ACME on her birthday")
         assert len(doc.ents) == 0
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_and_add(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "DATE", "pattern": "last time"}]
     ruler.add_patterns(patterns)
     doc = ruler(
@@ -643,10 +574,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "last time"
     assert doc.ents[1].label_ == "DATE"
     assert doc.ents[1].text == "this time"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc("I saw him last time we met, this time he brought some flowers")
     )
@@ -669,10 +597,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     )
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc(
             "I saw him last time we met, this time he brought some flowers, another time some chocolate."
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 6bbe743a12d..8170488f758 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,15 +8,9 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import (
-    DependencyParser,
-    EntityRecognizer,
-    EntityRuler,
-    SentenceRecognizer,
-    Tagger,
-    TextCategorizer,
-    TrainablePipe,
-)
+from spacy.pipeline import DependencyParser, EntityRecognizer
+from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
+from spacy.pipeline import TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
@@ -91,58 +85,17 @@ def test_issue_3526_1(en_vocab):
         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
     ]
     nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     ruler_bytes = ruler.to_bytes()
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
-    assert ruler.overwrite
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
+    )
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(ruler)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.overwrite == ruler.overwrite
-    assert new_ruler.ent_id_sep == ruler.ent_id_sep
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_2(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
-    new_ruler = EntityRuler(nlp)
-    new_ruler = new_ruler.from_bytes(bytes_old_style)
-    assert len(new_ruler) == len(ruler)
-    for pattern in ruler.patterns:
-        assert pattern in new_ruler.patterns
-    assert new_ruler.overwrite is not ruler.overwrite
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_3(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    with make_tempdir() as tmpdir:
-        out_file = tmpdir / "entity_ruler"
-        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
-        new_ruler = EntityRuler(nlp).from_disk(out_file)
-        for pattern in ruler.patterns:
-            assert pattern in new_ruler.patterns
-        assert len(new_ruler) == len(ruler)
-        assert new_ruler.overwrite is not ruler.overwrite
 
 
 @pytest.mark.issue(3526)
@@ -156,16 +109,14 @@ def test_issue_3526_4(en_vocab):
         nlp.to_disk(tmpdir)
         ruler = nlp.get_pipe("entity_ruler")
         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert ruler.overwrite is True
         nlp2 = load(tmpdir)
         new_ruler = nlp2.get_pipe("entity_ruler")
         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert new_ruler.overwrite is True
 
 
 @pytest.mark.issue(4042)
 def test_issue4042():
-    """Test that serialization of an EntityRuler before NER works fine."""
+    """Test that serialization of an entity_ruler before NER works fine."""
     nlp = English()
     # add ner pipe
     ner = nlp.add_pipe("ner")
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index a35b6e2566c..7976e7725e0 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,13 +1,24 @@
 ---
 title: EntityRuler
-tag: class
-source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
 ---
 
+<Infobox title="New in v4" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](#migrating) below for differences between the v3
+`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+See the [`SpanRuler`](/api/spanruler) API docs for the full API.
+
+</Infobox>
+
 The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
 token-based rules or exact phrase matches. It can be combined with the
 statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
@@ -64,273 +75,51 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-```python
-%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
-```
-
-## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
-
-Initialize the entity ruler. If patterns are supplied here, they need to be a
-list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
-be a token pattern (list) or a phrase pattern (string). For example:
-`{"label": "ORG", "pattern": "Apple"}`.
-
-> #### Example
->
-> ```python
-> # Construction via add_pipe
-> ruler = nlp.add_pipe("entity_ruler")
->
-> # Construction from class
-> from spacy.pipeline import EntityRuler
-> ruler = EntityRuler(nlp, overwrite_ents=True)
-> ```
-
-| Name                                                 | Description                                                                                                                                                                                                                           |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
-| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
-| _keyword-only_                                       |                                                                                                                                                                                                                                       |
-| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
-| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
-| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
-| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
-| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
-| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
-
-## EntityRuler.initialize {id="initialize",tag="method",version="3"}
-
-Initialize the component with data and used before training to load in rules
-from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
-method is typically called by [`Language.initialize`](/api/language#initialize)
-and lets you customize arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-> #### Example
->
-> ```python
-> entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.entity_ruler]
->
-> [initialize.components.entity_ruler.patterns]
-> @readers = "srsly.read_jsonl.v1"
-> path = "corpus/entity_ruler_patterns.jsonl
-> ```
-
-| Name           | Description                                                                                                                                                          |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                                                      |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
-
-## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
-
-The number of all patterns added to the entity ruler.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> assert len(ruler) == 0
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert len(ruler) == 1
-> ```
-
-| Name        | Description                     |
-| ----------- | ------------------------------- |
-| **RETURNS** | The number of patterns. ~~int~~ |
-
-## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
-
-Whether a label is present in the patterns.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert "ORG" in ruler
-> assert not "PERSON" in ruler
-> ```
-
-| Name        | Description                                           |
-| ----------- | ----------------------------------------------------- |
-| `label`     | The label to check. ~~str~~                           |
-| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
-
-## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
-
-Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
-happens automatically after the component has been added to the pipeline using
-[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
-with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occuring first in the Doc
-is chosen.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
->
-> doc = nlp("A text about Apple.")
-> ents = [(ent.text, ent.label_) for ent in doc.ents]
-> assert ents == [("Apple", "ORG")]
-> ```
-
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
-| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+## Migrating from v3 {#migrating}
 
-## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+### Loading patterns
 
-Add patterns to the entity ruler. A pattern can either be a token pattern (list
-of dicts) or a phrase pattern (string). For more details, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
+Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
+initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
+path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
+JSONL file separately and then added through
+[`SpanRuler.initialize`](/api/spanruler#initialize]) or
+[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
 
-> #### Example
->
-> ```python
-> patterns = [
->     {"label": "ORG", "pattern": "Apple"},
->     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
-> ]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ```
-
-| Name       | Description                                                      |
-| ---------- | ---------------------------------------------------------------- |
-| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
-
-## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
-
-Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
-the ID does not exist.
-
-> #### Example
->
-> ```python
-> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ruler.remove("apple")
-> ```
-
-| Name | Description                         |
-| ---- | ----------------------------------- |
-| `id` | The ID of the pattern rule. ~~str~~ |
-
-## EntityRuler.to_disk {id="to_disk",tag="method"}
-
-Save the entity ruler patterns to a directory. The patterns will be saved as
-newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
-only the patterns are saved as JSONL. If a directory name is provided, a
-`patterns.jsonl` and `cfg` file with the component configuration is exported.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
-> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
-> ```
-
-| Name   | Description                                                                                                                                              |
-| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-
-## EntityRuler.from_disk {id="from_disk",tag="method"}
-
-Load the entity ruler from a path. Expects either a file containing
-newline-delimited JSON (JSONL) with one entry per line, or a directory
-containing a `patterns.jsonl` file and a `cfg` file with the component
-configuration.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
-> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
-> ```
-
-| Name        | Description                                                                                                   |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
-
-## EntityRuler.to_bytes {id="to_bytes",tag="method"}
-
-Serialize the entity ruler patterns to a bytestring.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler_bytes = ruler.to_bytes()
-> ```
-
-| Name        | Description                        |
-| ----------- | ---------------------------------- |
-| **RETURNS** | The serialized patterns. ~~bytes~~ |
-
-## EntityRuler.from_bytes {id="from_bytes",tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> ruler_bytes = ruler.to_bytes()
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_bytes(ruler_bytes)
-> ```
-
-| Name         | Description                                        |
-| ------------ | -------------------------------------------------- |
-| `bytes_data` | The bytestring to load. ~~bytes~~                  |
-| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
-
-## EntityRuler.labels {id="labels",tag="property"}
-
-All labels present in the match patterns.
-
-| Name        | Description                            |
-| ----------- | -------------------------------------- |
-| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.from_disk("patterns.jsonl")
++ import srsly
++ patterns = srsly.read_jsonl("patterns.jsonl")
++ ruler.add_patterns(patterns)
+```
 
-## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+### Saving patterns
 
-All entity IDs present in the `id` properties of the match patterns.
+`SpanRuler.to_disk` always saves the full component data to a directory and does
+not include an option to save the patterns to a single JSONL file.
 
-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.to_disk("patterns.jsonl")
++ import srsly
++ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
+```
 
-## EntityRuler.patterns {id="patterns",tag="property"}
+### Accessing token and phrase patterns
 
-Get all patterns that were added to the entity ruler.
+The separate token patterns and phrase patterns are no longer accessible under
+`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
+patterns in their original format using the property
+[`SpanRuler.patterns`](/api/spanruler#patterns).
 
-| Name        | Description                                                                              |
-| ----------- | ---------------------------------------------------------------------------------------- |
-| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+### Removing patterns by ID
 
-## Attributes {id="attributes"}
+[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
+remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
 
-| Name              | Description                                                                                                           |
-| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
-| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
-| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.remove("id")
++ ruler.remove_by_id("id")
+```
diff --git a/website/docs/api/spanruler.mdx b/website/docs/api/spanruler.mdx
index 5889b1906ad..1b6c558acef 100644
--- a/website/docs/api/spanruler.mdx
+++ b/website/docs/api/spanruler.mdx
@@ -13,7 +13,18 @@ The span ruler lets you add spans to [`Doc.spans`](/api/doc#spans) and/or
 usage examples, see the docs on
 [rule-based span matching](/usage/rule-based-matching#spanruler).
 
-## Assigned Attributes {id="assigned-attributes"}
+<Infobox title="Replacement of the EntityRuler" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](/api/entityruler#migrating) for differences between
+the v3 `EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+</Infobox>
+
+## Assigned Attributes {#assigned-attributes}
 
 Matches will be saved to `Doc.spans[spans_key]` as a
 [`SpanGroup`](/api/spangroup) and/or to `Doc.ents`, where the annotation is
diff --git a/website/docs/usage/101/_architecture.mdx b/website/docs/usage/101/_architecture.mdx
index 2a63a3741fa..35c36088ab9 100644
--- a/website/docs/usage/101/_architecture.mdx
+++ b/website/docs/usage/101/_architecture.mdx
@@ -41,25 +41,27 @@ components for different language processing tasks and also allows adding
 
 ![The processing pipeline](/images/pipeline.svg)
 
-| Name                                            | Description                                                                                 |
-| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
-| [`AttributeRuler`](/api/attributeruler)         | Set token attributes using matcher rules.                                                   |
-| [`DependencyParser`](/api/dependencyparser)     | Predict syntactic dependencies.                                                             |
-| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words.                                                                |
-| [`EntityLinker`](/api/entitylinker)             | Disambiguate named entities to nodes in a knowledge base.                                   |
-| [`EntityRecognizer`](/api/entityrecognizer)     | Predict named entities, e.g. persons or products.                                           |
-| [`EntityRuler`](/api/entityruler)               | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
-| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words using rules and lookups.                                  |
-| [`Morphologizer`](/api/morphologizer)           | Predict morphological features and coarse-grained part-of-speech tags.                      |
-| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries.                                                                |
-| [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
-| [`Tagger`](/api/tagger)                         | Predict part-of-speech tags.                                                                |
-| [`TextCategorizer`](/api/textcategorizer)       | Predict categories or labels over the whole document.                                       |
-| [`Tok2Vec`](/api/tok2vec)                       | Apply a "token-to-vector" model and set its outputs.                                        |
-| [`Tokenizer`](/api/tokenizer)                   | Segment raw text and create `Doc` objects from the words.                                   |
-| [`TrainablePipe`](/api/pipe)                    | Class that all trainable pipeline components inherit from.                                  |
-| [`Transformer`](/api/transformer)               | Use a transformer model and set its outputs.                                                |
-| [Other functions](/api/pipeline-functions)      | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
+| Component name         | Component class                                      | Description                                                                                 |
+| ---------------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| `attribute_ruler`      | [`AttributeRuler`](/api/attributeruler)              | Set token attributes using matcher rules.                                                   |
+| `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Disambiguate named entities to nodes in a knowledge base.                                   |
+| `entity_ruler`         | [`SpanRuler`](/api/spanruler)                        | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
+| `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Determine the base forms of words using rules and lookups.                                  |
+| `morphologizer`        | [`Morphologizer`](/api/morphologizer)                | Predict morphological features and coarse-grained part-of-speech tags.                      |
+| `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Predict named entities, e.g. persons or products.                                           |
+| `parser`               | [`DependencyParser`](/api/dependencyparser)          | Predict syntactic dependencies.                                                             |
+| `senter`               | [`SentenceRecognizer`](/api/sentencerecognizer)      | Predict sentence boundaries.                                                                |
+| `sentencizer`          | [`Sentencizer`](/api/sentencizer)                    | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
+| `span_ruler`           | [`SpanRuler`](/api/spanruler)                        | Add spans to the `Doc` using token-based rules or exact phrase matches.                     |
+| `tagger`               | [`Tagger`](/api/tagger)                              | Predict part-of-speech tags.                                                                |
+| `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Predict exactly one category or label over a whole document.                                |
+| `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Predict 0, 1 or more categories or labels over a whole document.                            |
+| `tok2vec`              | [`Tok2Vec`](/api/tok2vec)                            | Apply a "token-to-vector" model and set its outputs.                                        |
+| `tokenizer`            | [`Tokenizer`](/api/tokenizer)                        | Segment raw text and create `Doc` objects from the words.                                   |
+| `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer)      | Predict base forms of words.                                                                |
+| `transformer`          | [`Transformer`](/api/transformer)                    | Use a transformer model and set its outputs.                                                |
+| -                      | [`TrainablePipe`](/api/pipe)                         | Class that all trainable pipeline components inherit from.                                  |
+| -                      | [Other functions](/api/pipeline-functions)           | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
 
 ### Matchers {id="architecture-matchers"}
 
diff --git a/website/docs/usage/101/_pipelines.mdx b/website/docs/usage/101/_pipelines.mdx
index 315291762ff..e5a08c5e424 100644
--- a/website/docs/usage/101/_pipelines.mdx
+++ b/website/docs/usage/101/_pipelines.mdx
@@ -51,9 +51,9 @@ example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
 only work if it's added after the tagger. The parser will respect pre-defined
 sentence boundaries, so if a previous component in the pipeline sets them, its
 dependency predictions may be different. Similarly, it matters if you add the
-[`EntityRuler`](/api/entityruler) before or after the statistical entity
-recognizer: if it's added before, the entity recognizer will take the existing
-entities into account when making predictions. The
+[`SpanRuler`](/api/spanruler) before or after the statistical entity recognizer:
+if it's added before and it is writing to `doc.ents`, then the entity recognizer
+will take those existing entities into account when making predictions. The
 [`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge
 base IDs, should be preceded by a pipeline component that recognizes entities
 such as the [`EntityRecognizer`](/api/entityrecognizer).
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index 3e58b251dec..ec93aee2cf3 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -297,13 +297,14 @@ available pipeline components and component functions.
 > ruler = nlp.add_pipe("entity_ruler")
 > ```
 
-| String name            | Component                                            | Description                                                                               |
+| Component name         | Component class                                      | Description                                                                               |
 | ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- |
 | `tagger`               | [`Tagger`](/api/tagger)                              | Assign part-of-speech-tags.                                                               |
 | `parser`               | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                 |
 | `ner`                  | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                    |
 | `entity_linker`        | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
-| `entity_ruler`         | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                            |
+| `span_ruler`           | [`SpanRuler`](/api/spanruler)                        | Assign spans based on pattern rules and dictionaries.                                     |
+| `entity_ruler`         | [`SpanRuler`](/api/spanruler)                        | Assign named entities based on pattern rules and dictionaries.                            |
 | `textcat`              | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                   |
 | `textcat_multilabel`   | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.   |
 | `lemmatizer`           | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words using rules and lookups.                                       |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index c90172b4325..86220440991 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -511,7 +511,7 @@ matches = matcher(doc)
 ```
 
 A very similar logic has been implemented in the built-in
-[`EntityRuler`](/api/entityruler) by the way. It also takes care of handling
+[`entity_ruler`](/api/entityruler) by the way. It also takes care of handling
 overlapping matches, which you would otherwise have to take care of yourself.
 
 > #### Tip: Visualizing matches
@@ -1305,7 +1305,7 @@ of patterns such as `{}` that match any token in the sentence.
 
 ## Rule-based entity recognition {id="entityruler",version="2.1"}
 
-The [`EntityRuler`](/api/entityruler) is a component that lets you add named
+The [`entity_ruler`](/api/entityruler) is a component that lets you add named
 entities based on pattern dictionaries, which makes it easy to combine
 rule-based and statistical named entity recognition for even more powerful
 pipelines.
@@ -1330,13 +1330,12 @@ pattern. The entity ruler accepts two types of patterns:
 
 ### Using the entity ruler {id="entityruler-usage"}
 
-The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically
-added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
-called on a text, it will find matches in the `doc` and add them as entities to
-the `doc.ents`, using the specified pattern label as the entity label. If any
-matches were to overlap, the pattern matching most tokens takes priority. If
-they also happen to be equally long, then the match occurring first in the `Doc`
-is chosen.
+The `entity_ruler` is a pipeline component that's typically added via
+[`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a
+text, it will find matches in the `doc` and add them as entities to `doc.ents`,
+using the specified pattern label as the entity label. If any matches were to
+overlap, the pattern matching most tokens takes priority. If they also happen to
+be equally long, then the match occurring first in the `Doc` is chosen.
 
 ```python {executable="true"}
 from spacy.lang.en import English
@@ -1372,7 +1371,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```
 
-#### Validating and debugging EntityRuler patterns {id="entityruler-pattern-validation",version="2.1.8"}
+#### Validating and debugging entity ruler patterns {#entityruler-pattern-validation new="2.1.8"}
 
 The entity ruler can validate patterns against a JSON schema with the config
 setting `"validate"`. See details under
@@ -1384,9 +1383,9 @@ ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
 
 ### Adding IDs to patterns {id="entityruler-ent-ids",version="2.2.2"}
 
-The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each
-pattern. Using the `id` attribute allows multiple patterns to be associated with
-the same entity.
+The [`entity_ruler`](/api/entityruler) can also accept an `id` attribute for
+each pattern. Using the `id` attribute allows multiple patterns to be associated
+with the same entity.
 
 ```python {executable="true"}
 from spacy.lang.en import English
@@ -1405,10 +1404,10 @@ doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given
-in the patterns. So in the example above it's easy to identify that "San
-Francisco" and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given in
+the patterns. So in the example above it's easy to identify that "San Francisco"
+and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 
@@ -1431,13 +1430,13 @@ new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl")
 
 If you're using the [Prodigy](https://prodi.gy) annotation tool, you might
 recognize these pattern files from bootstrapping your named entity and text
-classification labelling. The patterns for the `EntityRuler` follow the same
+classification labelling. The patterns for the `entity_ruler` follow the same
 syntax, so you can use your existing Prodigy pattern files in spaCy, and vice
 versa.
 
 </Infobox>
 
-When you save out an `nlp` object that has an `EntityRuler` added to its
+When you save out an `nlp` object that has an `entity_ruler` added to its
 pipeline, its patterns are automatically exported to the pipeline directory:
 
 ```python
@@ -1460,9 +1459,9 @@ rules included!
 
 When using a large amount of **phrase patterns** (roughly > 10000) it's useful
 to understand how the `add_patterns` function of the entity ruler works. For
-each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc
-object. This happens in case you try to add the EntityRuler at the end of an
-existing pipeline with, for example, a POS tagger and want to extract matches
+each **phrase pattern**, the entity ruler calls the nlp object to construct a
+doc object. This happens in case you try to add the entity ruler at the end of
+an existing pipeline with, for example, a POS tagger and want to extract matches
 based on the pattern's POS signature. In this case you would pass a config value
 of `"phrase_matcher_attr": "POS"` for the entity ruler.
 
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index b44bd86ed06..97ae3c5e573 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,13 +187,13 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the new
-> [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the
+> [`SpanRuler`](/api/spanruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
-> saving out a pipeline with a rule-based entity recognizer and including all
-> rules _with_ the component data.
+> saving out a pipeline with rule-based components _with_ all the component
+> data.
 
 ```python {highlight="16-23,25-30"}
 import json
diff --git a/website/docs/usage/training.mdx b/website/docs/usage/training.mdx
index abb1b9cfd91..eda3f355f1a 100644
--- a/website/docs/usage/training.mdx
+++ b/website/docs/usage/training.mdx
@@ -421,7 +421,7 @@ your components during training, and the most common scenarios are:
 2. Update an existing **trained component** with more examples.
 3. Include an existing trained component without updating it.
 4. Include a non-trainable component, like a rule-based
-   [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a
+   [`SpanRuler`](/api/spanruler) or [`Sentencizer`](/api/sentencizer), or a
    fully [custom component](/usage/processing-pipelines#custom-components).
 
 If a component block defines a `factory`, spaCy will look it up in the

From 9f9ff47dc507b19bbee64088bab7e911831dd7e6 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Wed, 23 Nov 2022 13:09:32 +0100
Subject: [PATCH 243/504] Remove sentiment extension (#11722)

* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
---
 spacy/lexeme.pyi                            |   1 -
 spacy/lexeme.pyx                            |  14 --
 spacy/tests/README.md                       |   2 +-
 spacy/tests/doc/test_doc_api.py             |  13 +-
 spacy/tests/doc/test_span.py                |  25 ---
 spacy/tests/matcher/test_matcher_api.py     |   3 -
 spacy/tokens/doc.pxd                        |   2 -
 spacy/tokens/doc.pyi                        |   1 -
 spacy/tokens/doc.pyx                        |   5 -
 spacy/tokens/span.pyi                       |   2 -
 spacy/tokens/span.pyx                       |  10 --
 spacy/tokens/token.pyi                      |   2 -
 spacy/tokens/token.pyx                      |   8 -
 website/docs/api/doc.mdx                    |   2 -
 website/docs/api/lexeme.md                  | 163 ++++++++++++++++++++
 website/docs/api/span.mdx                   |   1 -
 website/docs/api/token.mdx                  |   1 -
 website/docs/usage/processing-pipelines.mdx |   2 +-
 website/docs/usage/rule-based-matching.mdx  |  16 +-
 19 files changed, 185 insertions(+), 88 deletions(-)
 create mode 100644 website/docs/api/lexeme.md

diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 9980b9fcefa..fb937d7b998 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -19,7 +19,6 @@ class Lexeme:
     def vector_norm(self) -> float: ...
     vector: Floats1d
     rank: int
-    sentiment: float
     @property
     def orth_(self) -> str: ...
     @property
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 7a0c19bf301..22d5b4a5c3e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -193,20 +193,6 @@ cdef class Lexeme:
     def rank(self, value):
         self.c.id = value
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the lexeme."""
-        sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
-        return sentiment_table.get(self.c.orth, 0.0)
-
-    @sentiment.setter
-    def sentiment(self, float x):
-        if "lexeme_sentiment" not in self.vocab.lookups:
-            self.vocab.lookups.add_table("lexeme_sentiment")
-        sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
-        sentiment_table[self.c.orth] = x
-
     @property
     def orth_(self):
         """RETURNS (str): The original verbatim text of the lexeme
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index 82fabcc778b..f3c96a39e7c 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 
 To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
 
-- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
 - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
 - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 73544c51a4f..946910b29e1 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -389,9 +389,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 
-    new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"]
-    )
+    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@@ -999,3 +997,12 @@ def test_doc_spans_setdefault(en_tokenizer):
     assert len(doc.spans["key2"]) == 1
     doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
     assert len(doc.spans["key3"]) == 2
+
+
+def test_doc_sentiment_from_bytes_v3_to_v4():
+    """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. The sentiment attribute was removed in v4"""
+    doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3<u8\xc4\x04kind\xc4\x00\xc4\x05shape\x92\x01\x0f\xc4\x04data\xc4x\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xa4\x9a\xd3\x17\xca\xf0b\x03\xa4\x9a\xd3\x17\xca\xf0b\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa9sentiment\xcb?\xf0\x00\x00\x00\x00\x00\x00\xa6tensor\x85\xc4\x02nd\xc3\xc4\x04type\xa3<f4\xc4\x04kind\xc4\x00\xc4\x05shape\x91\x00\xc4\x04data\xc4\x00\xa4cats\x80\xa5spans\xc4\x01\x90\xa7strings\x92\xa0\xa5happy\xb2has_unknown_spaces\xc2"
+    doc = Doc(Vocab()).from_bytes(doc_bytes)
+    assert doc.text == "happy"
+    with pytest.raises(AttributeError):
+        doc.sentiment == 1.0
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index ab8538b17dc..74874624888 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -306,31 +306,6 @@ def test_span_similarity_match():
         assert span1[:1].similarity(doc.vocab["a"]) == 1.0
 
 
-def test_spans_default_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2.0 / 2
-    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
-
-
-def test_spans_override_sentiment(en_tokenizer):
-    """Test span.sentiment property's default averaging behaviour"""
-    text = "good stuff bad stuff"
-    tokens = en_tokenizer(text)
-    tokens.vocab[tokens[0].text].sentiment = 3.0
-    tokens.vocab[tokens[2].text].sentiment = -2.0
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks["sentiment"] = lambda span: 10.0
-    assert doc[:2].sentiment == 10.0
-    assert doc[-2:].sentiment == 10.0
-    assert doc[:-1].sentiment == 10.0
-
-
 def test_spans_are_hashable(en_tokenizer):
     """Test spans can be hashed."""
     text = "good stuff bad stuff"
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 106a00b3011..ecb4385dd90 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -51,8 +51,6 @@ def test_matcher_from_usage_docs(en_vocab):
 
     def label_sentiment(matcher, doc, i, matches):
         match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == "HAPPY":
-            doc.sentiment += 0.1
         span = doc[start:end]
         with doc.retokenize() as retokenizer:
             retokenizer.merge(span)
@@ -62,7 +60,6 @@ def label_sentiment(matcher, doc, i, matches):
     matcher = Matcher(en_vocab)
     matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
     matcher(doc)
-    assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"
 
 
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 5e8975ed337..9fb6a72c87f 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -48,8 +48,6 @@ cdef class Doc:
 
     cdef TokenC* c
 
-    cdef public float sentiment
-
     cdef public dict activations
 
     cdef public dict user_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 5fda6f2f789..97c3f69f430 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -40,7 +40,6 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
-    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ed9e4cd999d..9a272d04781 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -279,7 +279,6 @@ cdef class Doc:
         self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
-        self.sentiment = 0.0
         self.cats = {}
         self.activations = {}
         self.user_hooks = {}
@@ -1318,7 +1317,6 @@ cdef class Doc:
         other.tensor = copy.deepcopy(self.tensor)
         other.cats = copy.deepcopy(self.cats)
         other.user_data = copy.deepcopy(self.user_data)
-        other.sentiment = self.sentiment
         other.has_unknown_spaces = self.has_unknown_spaces
         other.user_hooks = dict(self.user_hooks)
         other.user_token_hooks = dict(self.user_token_hooks)
@@ -1418,7 +1416,6 @@ cdef class Doc:
             "text": lambda: self.text,
             "array_head": lambda: array_head,
             "array_body": lambda: self.to_array(array_head),
-            "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
             "spans": lambda: self.spans.to_bytes(),
@@ -1454,8 +1451,6 @@ cdef class Doc:
             for key, value in zip(user_data_keys, user_data_values):
                 self.user_data[key] = value
         cdef int i, start, end, has_space
-        if "sentiment" not in exclude and "sentiment" in msg:
-            self.sentiment = msg["sentiment"]
         if "tensor" not in exclude and "tensor" in msg:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index a6731d1c2d4..ae4a6209e7e 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -89,8 +89,6 @@ class Span:
     @property
     def tensor(self) -> FloatsXd: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 191f3783e14..26b56748d32 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -594,16 +594,6 @@ cdef class Span:
             return None
         return self.doc.tensor[self.start : self.end]
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the span.
-        """
-        if "sentiment" in self.doc.user_span_hooks:
-            return self.doc.user_span_hooks["sentiment"](self)
-        else:
-            return sum([token.sentiment for token in self]) / len(self)
-
     @property
     def text(self):
         """RETURNS (str): The original verbatim text of the span."""
diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi
index 435ace52707..5c3d4d0ba2b 100644
--- a/spacy/tokens/token.pyi
+++ b/spacy/tokens/token.pyi
@@ -78,8 +78,6 @@ class Token:
     @property
     def prob(self) -> float: ...
     @property
-    def sentiment(self) -> float: ...
-    @property
     def lang(self) -> int: ...
     @property
     def idx(self) -> int: ...
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8daff95b705..0e192843ae0 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -309,14 +309,6 @@ cdef class Token:
         """RETURNS (float): Smoothed log probability estimate of token type."""
         return self.vocab[self.c.lex.orth].prob
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-            negativity of the token."""
-        if "sentiment" in self.doc.user_token_hooks:
-            return self.doc.user_token_hooks["sentiment"](self)
-        return self.vocab[self.c.lex.orth].sentiment
-
     @property
     def lang(self):
         """RETURNS (uint64): ID of the language of the parent document's
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 310ce0dc88d..28757cbc45f 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -762,7 +762,6 @@ The L2 norm of the document's vector representation.
 | `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
 | `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
 | `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
-| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
 | `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
 | `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
 | `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
@@ -786,7 +785,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name               | Description                                   |
 | ------------------ | --------------------------------------------- |
 | `text`             | The value of the `Doc.text` attribute.        |
-| `sentiment`        | The value of the `Doc.sentiment` attribute.   |
 | `tensor`           | The value of the `Doc.tensor` attribute.      |
 | `user_data`        | The value of the `Doc.user_data` dictionary.  |
 | `user_data_keys`   | The keys of the `Doc.user_data` dictionary.   |
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
new file mode 100644
index 00000000000..db1aba7aaec
--- /dev/null
+++ b/website/docs/api/lexeme.md
@@ -0,0 +1,163 @@
+---
+title: Lexeme
+teaser: An entry in the vocabulary
+tag: class
+source: spacy/lexeme.pyx
+---
+
+A `Lexeme` has no string context – it's a word type, as opposed to a word token.
+It therefore has no part-of-speech tag, dependency parse, or lemma (if
+lemmatization depends on the part-of-speech tag).
+
+## Lexeme.\_\_init\_\_ {#init tag="method"}
+
+Create a `Lexeme` object.
+
+| Name    | Description                        |
+| ------- | ---------------------------------- |
+| `vocab` | The parent vocabulary. ~~Vocab~~   |
+| `orth`  | The orth id of the lexeme. ~~int~~ |
+
+## Lexeme.set_flag {#set_flag tag="method"}
+
+Change the value of a boolean flag.
+
+> #### Example
+>
+> ```python
+> COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
+> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
+> ```
+
+| Name      | Description                                  |
+| --------- | -------------------------------------------- |
+| `flag_id` | The attribute ID of the flag to set. ~~int~~ |
+| `value`   | The new value of the flag. ~~bool~~          |
+
+## Lexeme.check_flag {#check_flag tag="method"}
+
+Check the value of a boolean flag.
+
+> #### Example
+>
+> ```python
+> is_my_library = lambda text: text in ["spaCy", "Thinc"]
+> MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
+> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
+> ```
+
+| Name        | Description                                    |
+| ----------- | ---------------------------------------------- |
+| `flag_id`   | The attribute ID of the flag to query. ~~int~~ |
+| **RETURNS** | The value of the flag. ~~bool~~                |
+
+## Lexeme.similarity {#similarity tag="method" model="vectors"}
+
+Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> orange = nlp.vocab["orange"]
+> apple_orange = apple.similarity(orange)
+> orange_apple = orange.similarity(apple)
+> assert apple_orange == orange_apple
+> ```
+
+| Name        | Description                                                                                                                      |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| other       | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
+| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~                                                                     |
+
+## Lexeme.has_vector {#has_vector tag="property" model="vectors"}
+
+A boolean value indicating whether a word vector is associated with the lexeme.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> assert apple.has_vector
+> ```
+
+| Name        | Description                                             |
+| ----------- | ------------------------------------------------------- |
+| **RETURNS** | Whether the lexeme has a vector data attached. ~~bool~~ |
+
+## Lexeme.vector {#vector tag="property" model="vectors"}
+
+A real-valued meaning representation.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> assert apple.vector.dtype == "float32"
+> assert apple.vector.shape == (300,)
+> ```
+
+| Name        | Description                                                                                      |
+| ----------- | ------------------------------------------------------------------------------------------------ |
+| **RETURNS** | A 1-dimensional array representing the lexeme's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+
+## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"}
+
+The L2 norm of the lexeme's vector representation.
+
+> #### Example
+>
+> ```python
+> apple = nlp.vocab["apple"]
+> pasta = nlp.vocab["pasta"]
+> apple.vector_norm  # 7.1346845626831055
+> pasta.vector_norm  # 7.759851932525635
+> assert apple.vector_norm != pasta.vector_norm
+> ```
+
+| Name        | Description                                         |
+| ----------- | --------------------------------------------------- |
+| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
+
+## Attributes {#attributes}
+
+| Name                                         | Description                                                                                                                                                                                                                                                          |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                      | The lexeme's vocabulary. ~~Vocab~~                                                                                                                                                                                                                                   |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                       |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                             |
+| `orth_`                                      | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                 |
+| `rank`                                       | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                |
+| `flags`                                      | Container of the lexeme's binary flags. ~~int~~                                                                                                                                                                                                                      |
+| `norm`                                       | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~                                                                                                                                                                                                |
+| `norm_`                                      | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~                                                                                                                                                                                                |
+| `lower`                                      | Lowercase form of the word. ~~int~~                                                                                                                                                                                                                                  |
+| `lower_`                                     | Lowercase form of the word. ~~str~~                                                                                                                                                                                                                                  |
+| `shape`                                      | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `prefix`                                     | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~                                                                                                                                                                                            |
+| `prefix_`                                    | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~                                                                                                                                                                                            |
+| `suffix`                                     | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~                                                                                                                                                                                              |
+| `suffix_`                                    | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
+| `is_alpha`                                   | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~                                                                                                                                                                    |
+| `is_ascii`                                   | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~                                                                                                                                                     |
+| `is_digit`                                   | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~                                                                                                                                                                                   |
+| `is_lower`                                   | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. ~~bool~~                                                                                                                                                                                          |
+| `is_upper`                                   | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. ~~bool~~                                                                                                                                                                                          |
+| `is_title`                                   | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. ~~bool~~                                                                                                                                                                                          |
+| `is_punct`                                   | Is the lexeme punctuation? ~~bool~~                                                                                                                                                                                                                                  |
+| `is_left_punct`                              | Is the lexeme a left punctuation mark, e.g. `(`? ~~bool~~                                                                                                                                                                                                            |
+| `is_right_punct`                             | Is the lexeme a right punctuation mark, e.g. `)`? ~~bool~~                                                                                                                                                                                                           |
+| `is_space`                                   | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. ~~bool~~                                                                                                                                                                    |
+| `is_bracket`                                 | Is the lexeme a bracket? ~~bool~~                                                                                                                                                                                                                                    |
+| `is_quote`                                   | Is the lexeme a quotation mark? ~~bool~~                                                                                                                                                                                                                             |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the lexeme a currency symbol? ~~bool~~                                                                                                                                                                                                                            |
+| `like_url`                                   | Does the lexeme resemble a URL? ~~bool~~                                                                                                                                                                                                                             |
+| `like_num`                                   | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                          |
+| `like_email`                                 | Does the lexeme resemble an email address? ~~bool~~                                                                                                                                                                                                                  |
+| `is_oov`                                     | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                      |
+| `is_stop`                                    | Is the lexeme part of a "stop list"? ~~bool~~                                                                                                                                                                                                                        |
+| `lang`                                       | Language of the parent vocabulary. ~~int~~                                                                                                                                                                                                                           |
+| `lang_`                                      | Language of the parent vocabulary. ~~str~~                                                                                                                                                                                                                           |
+| `prob`                                       | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                 |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 5e7495f17ca..1774a298ff2 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -568,5 +568,4 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 12b99394350..16d421c12f4 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index ec93aee2cf3..c0fc4207046 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1388,7 +1388,7 @@ separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
 immediately know what's built-in and what's custom – for example,
-`doc.sentiment` is spaCy, while `doc._.sent_score` isn't.
+`doc.lang` is spaCy, while `doc._.language` isn't.
 
 </Accordion>
 
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 86220440991..8469d587ed1 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -811,6 +811,9 @@ whitespace, making them easy to match as well.
 ```python {executable="true"}
 from spacy.lang.en import English
 from spacy.matcher import Matcher
+from spacy.tokens import Doc
+
+Doc.set_extension("sentiment", default=0.0)
 
 nlp = English()  # We only want the tokenizer, so no need to load a pipeline
 matcher = Matcher(nlp.vocab)
@@ -826,9 +829,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
 matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
 matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
@@ -857,17 +860,18 @@ is "Smiling Face With Heart-Eyes". Assigning it to a
 the emoji span will make it available as `span._.emoji_desc`.
 
 ```python
-import emoji  # Installation: pip install emoji
-from spacy.tokens import Span  # Get the global Span object
+from emojipedia import Emojipedia  # Installation: pip install emojipedia
+from spacy.tokens import Doc, Span  # Get the global Doc and Span object
 
 Span.set_extension("emoji_desc", default=None)  # Register the custom attribute
+Doc.set_extension("sentiment", default=0.0)
 
 def label_sentiment(matcher, doc, i, matches):
     match_id, start, end = matches[i]
     if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
-        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
+        doc._.sentiment += 0.1  # Add 0.1 for positive sentiment
     elif doc.vocab.strings[match_id] == "SAD":
-        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
+        doc._.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
     span = doc[start:end]
     # Verify if it is an emoji and set the extension attribute correctly.
     if emoji.is_emoji(span[0].text):

From a393b4cd5080de3aaffaf99f4476f63c74be66c4 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 5 Dec 2022 08:57:24 +0100
Subject: [PATCH 244/504] prettier formatting

---
 website/docs/api/cli.mdx                    | 30 ++++++++++-----------
 website/docs/usage/processing-pipelines.mdx |  4 +--
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 950d98c1f68..47028f4a2e7 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1343,21 +1343,21 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                     | Description                                                                                                                                                                          |
-| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                     |
-| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-
-## assemble {id="assemble",tag="command"}
+| Name                    | Description                                                                                                                                                                          |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
+| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
+| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
+| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
+| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
+| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
+| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
+| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+
+## assemble {#assemble tag="command"}
 
 Assemble a pipeline from a config file without additional training. Expects a
 [config file](/api/data-formats#config) with all settings and hyperparameters.
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index c0fc4207046..fb5de5da102 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1387,8 +1387,8 @@ Writing to a `._` attribute instead of to the `Doc` directly keeps a clearer
 separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
-immediately know what's built-in and what's custom – for example,
-`doc.lang` is spaCy, while `doc._.language` isn't.
+immediately know what's built-in and what's custom – for example, `doc.lang` is
+spaCy, while `doc._.language` isn't.
 
 </Accordion>
 

From f3c3870f7152d7f08fa22beb875798b89ca6396e Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 5 Dec 2022 17:43:23 +0900
Subject: [PATCH 245/504] Switch ubuntu-latest to ubuntu-20.04 in main tests
 (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6
---
 azure-pipelines.yml | 103 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 azure-pipelines.yml

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 00000000000..0f7ea91f96f
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,103 @@
+trigger:
+  batch: true
+  branches:
+    include:
+      - "*"
+    exclude:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+  paths:
+    exclude:
+      - "website/*"
+      - "*.md"
+      - ".github/workflows/*"
+pr:
+  paths:
+    exclude:
+      - "*.md"
+      - "website/docs/*"
+      - "website/src/*"
+      - ".github/workflows/*"
+
+jobs:
+  # Perform basic checks for most important errors (syntax etc.) Uses the config
+  # defined in .flake8 and overwrites the selected codes.
+  - job: "Validate"
+    pool:
+      vmImage: "ubuntu-latest"
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "3.7"
+      - script: |
+          pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+        displayName: "flake8"
+
+  - job: "Test"
+    dependsOn: "Validate"
+    strategy:
+      matrix:
+        # We're only running one platform per Python version to speed up builds
+        Python36Linux:
+          imageName: "ubuntu-20.04"
+          python.version: "3.6"
+        #        Python36Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.6"
+        #        Python36Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.6"
+        #        Python37Linux:
+        #          imageName: "ubuntu-20.04"
+        #          python.version: "3.7"
+        Python37Windows:
+          imageName: "windows-latest"
+          python.version: "3.7"
+        #        Python37Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.7"
+        #        Python38Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.8"
+        #        Python38Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.8"
+        Python38Mac:
+          imageName: "macos-latest"
+          python.version: "3.8"
+        Python39Linux:
+          imageName: "ubuntu-latest"
+          python.version: "3.9"
+        #        Python39Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.9"
+        #        Python39Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.9"
+        #        Python310Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.10"
+        Python310Windows:
+          imageName: "windows-latest"
+          python.version: "3.10"
+        #        Python310Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11'
+      maxParallel: 4
+    pool:
+      vmImage: $(imageName)
+    steps:
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'

From 7e7599485a21f9efd516172041db7e11f678eb20 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:43:52 +0900
Subject: [PATCH 246/504] Remove all references to "begin_training" (#11943)

When v3 was released, `begin_training` was renamed to `initialize`.
There were warnings in the code and docs about that. This PR removes
them.
---
 spacy/errors.py                           |  7 -------
 spacy/language.py                         |  9 ---------
 spacy/pipeline/pipe.pyx                   |  7 -------
 spacy/tests/pipeline/test_pipe_methods.py | 11 -----------
 website/docs/api/dependencyparser.mdx     |  6 ------
 website/docs/api/entitylinker.mdx         |  6 ------
 website/docs/api/entityrecognizer.mdx     |  6 ------
 website/docs/api/language.mdx             |  9 ---------
 website/docs/api/pipe.mdx                 |  6 ------
 website/docs/api/tagger.mdx               |  6 ------
 website/docs/api/textcategorizer.mdx      |  6 ------
 11 files changed, 79 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 965c92066bc..454e71f987c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -132,13 +132,6 @@ class Warnings(metaclass=ErrorsWithCodes):
             "and make it independent. For example, `replace_listeners = "
             "[\"model.tok2vec\"]` See the documentation for details: "
             "https://spacy.io/usage/training#config-components-listeners")
-    W088 = ("The pipeline component {name} implements a `begin_training` "
-            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
-            "has been renamed to `initialize`, so you likely want to rename the "
-            "component method. See the documentation for details: "
-            "https://spacy.io/api/language#initialize")
-    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
-            "to `nlp.initialize`.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
diff --git a/spacy/language.py b/spacy/language.py
index 18d20c93932..a47cc5df454 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1277,15 +1277,6 @@ def get_grads(key, W, dW):
             sgd(key, W, dW)  # type: ignore[call-arg, misc]
         return losses
 
-    def begin_training(
-        self,
-        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
-        *,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
-        warnings.warn(Warnings.W089, DeprecationWarning)
-        return self.initialize(get_examples, sgd=sgd)
-
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 72ea7e45a80..ea5fc5253d9 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -21,13 +21,6 @@ cdef class Pipe:
     DOCS: https://spacy.io/api/pipe
     """
 
-    @classmethod
-    def __init_subclass__(cls, **kwargs):
-        """Raise a warning if an inheriting class implements 'begin_training'
-         (from v2) instead of the new 'initialize' method (from v3)"""
-        if hasattr(cls, "begin_training"):
-            warnings.warn(Warnings.W088.format(name=cls.__name__))
-
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 4dd7bae16c2..9b9786f0458 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -529,17 +529,6 @@ def test_pipe_label_data_no_labels(pipe):
         assert "labels" not in get_arg_names(initialize)
 
 
-def test_warning_pipe_begin_training():
-    with pytest.warns(UserWarning, match="begin_training"):
-
-        class IncompatPipe(TrainablePipe):
-            def __init__(self):
-                ...
-
-            def begin_training(*args, **kwargs):
-                ...
-
-
 def test_pipe_methods_initialize():
     """Test that the [initialize] config reflects the components correctly."""
     nlp = Language()
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index a6bc48cdf74..771a00aeee1 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -169,12 +169,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 85b872151fd..238b62a2e6d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -200,12 +200,6 @@ knowledge base. This argument should be a function that takes a `Vocab` instance
 and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
 are synced with the current vocab.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index c80406a5b81..1f386bbb6ff 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -165,12 +165,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 068e8ea7885..d5fbae05ec4 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -259,15 +259,6 @@ either in the [config](/usage/training#config), or by calling
 [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
 the tagger or textcat).
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`. It now also takes a
-**function** that is called with no arguments and returns a sequence of
-[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
-objects.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index c2777edf07e..b387ea58654 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -152,12 +152,6 @@ network,
 setting up the label scheme based on the data. This method is typically called
 by [`Language.initialize`](/api/language#initialize).
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 20852e8eb94..ae14df212ee 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -142,12 +142,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a1dfb6dd88e..5db3a409255 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -187,12 +187,6 @@ arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
 config.
 
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
 > #### Example
 >
 > ```python

From 8364156b8c9d0cefdaeea771f6f6d453e5c869d3 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:45:52 +0900
Subject: [PATCH 247/504] Remove old model shortcuts (#11916)

* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/download.py         | 18 ++----------------
 spacy/errors.py               | 16 ----------------
 spacy/util.py                 |  4 +---
 website/UNIVERSE.md           |  2 +-
 website/docs/usage/models.mdx | 29 +----------------------------
 website/meta/universe.json    | 20 ++++++++++++--------
 6 files changed, 17 insertions(+), 72 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 4261fb830d9..f371d110319 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,16 +7,8 @@
 from wasabi import msg
 
 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
-    get_minor_version,
-    is_in_interactive,
-    is_in_jupyter,
-    is_package,
-    is_prerelease_version,
-    run_command,
-)
-from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
+from ..util import is_package, get_minor_version, run_command
+from ..util import is_prerelease_version
 
 
 @app.command(
@@ -76,12 +68,6 @@ def download(
         version = components[-1]
     else:
         model_name = model
-        if model in OLD_MODEL_SHORTCUTS:
-            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
-                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
-            )
-            model_name = OLD_MODEL_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
 
diff --git a/spacy/errors.py b/spacy/errors.py
index 454e71f987c..5f03d0eae94 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -732,13 +732,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "method in component '{name}'. If you want to use this "
             "method, make sure it's overwritten on the subclass.")
     E940 = ("Found NaN values in scores.")
-    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
-            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
-            "load the model, use its full name instead:\n\n"
-            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
-            "models, see the models directory: https://spacy.io/models and if "
-            "you want to create a blank model, use spacy.blank: "
-            "nlp = spacy.blank(\"{name}\")")
     E942 = ("Executing `after_{name}` callback failed. Expected the function to "
             "return an initialized nlp object but got: {value}. Maybe "
             "you forgot to return the modified object in your function?")
@@ -986,15 +979,6 @@ class Errors(metaclass=ErrorsWithCodes):
              "but got '{received_type}'")
 
 
-# Deprecated model shortcuts, only used in errors and warnings
-OLD_MODEL_SHORTCUTS = {
-    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
-    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
-    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
-    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
-}
-
-
 # fmt: on
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 8068c4bcec9..463ac219bf5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -66,7 +66,7 @@
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
+from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
 from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
@@ -465,8 +465,6 @@ def load_model(
             return load_model_from_path(Path(name), **kwargs)  # type: ignore[arg-type]
     elif hasattr(name, "exists"):  # Path or Path-like to model data
         return load_model_from_path(name, **kwargs)  # type: ignore[arg-type]
-    if name in OLD_MODEL_SHORTCUTS:
-        raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name]))  # type: ignore[index]
     raise IOError(Errors.E050.format(name=name))
 
 
diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md
index ac4e2e684fb..a9008086c95 100644
--- a/website/UNIVERSE.md
+++ b/website/UNIVERSE.md
@@ -61,7 +61,7 @@ use a linter to verify that your markup is correct.
         "import spacy",
         "import package_name",
         "",
-        "nlp = spacy.load('en')",
+        "nlp = spacy.load('en_core_web_sm')",
         "nlp.add_pipe(package_name)"
     ],
     "code_language": "python",
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 9213dead16b..e74c37e3080 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -337,23 +337,7 @@ The easiest way to download a trained pipeline is via spaCy's
 [`download`](/api/cli#download) command. It takes care of finding the
 best-matching package compatible with your spaCy installation.
 
-> #### Important note for v3.0
->
-> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed pipeline package, use its full name:
->
-> ```diff
-> - python -m spacy download en
-> + python -m spacy download en_core_web_sm
-> ```
->
-> ```diff
-> - nlp = spacy.load("en")
-> + nlp = spacy.load("en_core_web_sm")
-> ```
-
-```bash
+```cli
 # Download best-matching version of a package for your spaCy installation
 $ python -m spacy download en_core_web_sm
 
@@ -483,17 +467,6 @@ spacy.cli.download("en_core_web_sm")
 To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with
 the package name or a path to the data directory:
 
-> #### Important note for v3.0
->
-> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially
-> brittle) symlinks in your spaCy installation are **deprecated**. To download
-> and load an installed pipeline package, use its full name:
->
-> ```diff
-> - python -m spacy download en
-> + python -m spacy download en_core_web_sm
-> ```
-
 ```python
 import spacy
 nlp = spacy.load("en_core_web_sm")           # load package "en_core_web_sm"
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 6278dd4899b..cb2386e1fb8 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1181,12 +1181,13 @@
             "author_links": {
                 "github": "mholtzscher"
             },
-            "category": ["pipeline"]
+            "category": ["pipeline"],
+            "spacy_version": 2
         },
         {
             "id": "spacy_cld",
             "title": "spaCy-CLD",
-            "slogan": "Add language detection to your spaCy pipeline using CLD2",
+            "slogan": "Add language detection to your spaCy v2 pipeline using CLD2",
             "description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score (reported in with each language.",
             "github": "nickdavidhaynes/spacy-cld",
             "pip": "spacy_cld",
@@ -1206,7 +1207,8 @@
             "author_links": {
                 "github": "nickdavidhaynes"
             },
-            "category": ["pipeline"]
+            "category": ["pipeline"],
+            "spacy_version": 2
         },
         {
             "id": "spacy-iwnlp",
@@ -1280,7 +1282,8 @@
                 "github": "sammous"
             },
             "category": ["pipeline"],
-            "tags": ["pos", "lemmatizer", "french"]
+            "tags": ["pos", "lemmatizer", "french"],
+            "spacy_version": 2
         },
         {
             "id": "lemmy",
@@ -1474,8 +1477,8 @@
         },
         {
             "id": "neuralcoref",
-            "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy",
-            "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.",
+            "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy v2",
+            "description": "This coreference resolution module is based on the super fast spaCy parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source, and v3+ is not supported.",
             "github": "huggingface/neuralcoref",
             "thumb": "https://i.imgur.com/j6FO9O6.jpg",
             "code_example": [
@@ -1496,7 +1499,8 @@
                 "github": "huggingface"
             },
             "category": ["standalone", "conversational", "models"],
-            "tags": ["coref"]
+            "tags": ["coref"],
+            "spacy_version": 2
         },
         {
             "id": "neuralcoref-vizualizer",
@@ -1572,7 +1576,7 @@
                 "import spacy",
                 "import explacy",
                 "",
-                "nlp = spacy.load('en')",
+                "nlp = spacy.load('en_core_web_sm')",
                 "explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')"
             ],
             "author": "Tyler Neylon",

From 5ff2b8cf5132ca87abdf4f3c30385eadc1fcf99f Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 8 Dec 2022 13:24:45 +0100
Subject: [PATCH 248/504] Remove unused, experimental multi-task components
 (#11919)

* Remove experimental multi-task components

These are incomplete implementations and are not usable in their current state.

* Remove orphaned error message

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Revert "Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)"

This reverts commit 77c0fd7b176be80e8438fa21440a85d1fe26e39b.

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 setup.py                     |   1 -
 spacy/errors.py              |   2 -
 spacy/pipeline/multitask.pyx | 215 -----------------------------------
 3 files changed, 218 deletions(-)
 delete mode 100644 spacy/pipeline/multitask.pyx

diff --git a/setup.py b/setup.py
index c9b4f7171e3..a80016ea9ea 100755
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,6 @@
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
diff --git a/spacy/errors.py b/spacy/errors.py
index 5f03d0eae94..11b8980fd9d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -249,8 +249,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E016 = ("MultitaskObjective target should be function or one of: dep, "
-            "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
deleted file mode 100644
index f33a90fde85..00000000000
--- a/spacy/pipeline/multitask.pyx
+++ /dev/null
@@ -1,215 +0,0 @@
-# cython: infer_types=True, binding=True
-from typing import Optional
-
-import numpy
-from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
-
-from ..attrs import ID
-from ..errors import Errors
-from ..language import Language
-from ..training import validate_examples
-from .tagger import Tagger
-from .trainable_pipe import TrainablePipe
-
-default_model_config = """
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v2"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-"""
-DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
-
-
-@Language.factory(
-    "nn_labeller",
-    default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
-)
-def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
-    return MultitaskObjective(nlp.vocab, model, name)
-
-
-class MultitaskObjective(Tagger):
-    """Experimental: Assist training of a parser or tagger, by training a
-    side-objective.
-    """
-
-    def __init__(self, vocab, model, name="nn_labeller", *, target):
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        if target == "dep":
-            self.make_label = self.make_dep
-        elif target == "tag":
-            self.make_label = self.make_tag
-        elif target == "ent":
-            self.make_label = self.make_ent
-        elif target == "dep_tag_offset":
-            self.make_label = self.make_dep_tag_offset
-        elif target == "ent_tag":
-            self.make_label = self.make_ent_tag
-        elif target == "sent_start":
-            self.make_label = self.make_sent_start
-        elif hasattr(target, "__call__"):
-            self.make_label = target
-        else:
-            raise ValueError(Errors.E016)
-        cfg = {"labels": {}, "target": target}
-        self.cfg = dict(cfg)
-
-    @property
-    def labels(self):
-        return self.cfg.setdefault("labels", {})
-
-    @labels.setter
-    def labels(self, value):
-        self.cfg["labels"] = value
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None, labels=None):
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
-            raise ValueError(err)
-        if labels is not None:
-            self.labels = labels
-        else:
-            for example in get_examples():
-                for token in example.y:
-                    label = self.make_label(token)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
-        self.model.initialize()   # TODO: fix initialization by defining X and Y
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        scores = self.model.get_ref("softmax")(tokvecs)
-        return tokvecs, scores
-
-    def get_loss(self, examples, scores):
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        for i, eg in enumerate(examples):
-            # Handles alignment for tokenization differences
-            _doc_annots = eg.get_aligned()  # TODO
-            for j in range(len(eg.predicted)):
-                tok_annots = {key: values[j] for key, values in tok_annots.items()}
-                label = self.make_label(j, tok_annots)
-                if label is None or label not in self.labels:
-                    correct[idx] = guesses[idx]
-                else:
-                    correct[idx] = self.labels[label]
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        loss = (d_scores**2).sum()
-        return float(loss), d_scores
-
-    @staticmethod
-    def make_dep(token):
-        return token.dep_
-
-    @staticmethod
-    def make_tag(token):
-        return token.tag_
-
-    @staticmethod
-    def make_ent(token):
-        if token.ent_iob_ == "O":
-            return "O"
-        else:
-            return token.ent_iob_ + "-" + token.ent_type_
-
-    @staticmethod
-    def make_dep_tag_offset(token):
-        dep = token.dep_
-        tag = token.tag_
-        offset = token.head.i - token.i
-        offset = min(offset, 2)
-        offset = max(offset, -2)
-        return f"{dep}-{tag}:{offset}"
-
-    @staticmethod
-    def make_ent_tag(token):
-        if token.ent_iob_ == "O":
-            ent = "O"
-        else:
-            ent = token.ent_iob_ + "-" + token.ent_type_
-        tag = token.tag_
-        return f"{tag}-{ent}"
-
-    @staticmethod
-    def make_sent_start(token):
-        """A multi-task objective for representing sentence boundaries,
-        using BILU scheme. (O is impossible)
-        """
-        if token.is_sent_start and token.is_sent_end:
-            return "U-SENT"
-        elif token.is_sent_start:
-            return "B-SENT"
-        else:
-            return "I-SENT"
-
-
-class ClozeMultitask(TrainablePipe):
-    def __init__(self, vocab, model, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self.cfg = cfg
-        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config
-
-    def set_annotations(self, docs, dep_ids):
-        pass
-
-    def initialize(self, get_examples, nlp=None):
-        self.model.initialize()  # TODO: fix initialization by defining X and Y
-        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.initialize(X)
-
-    def predict(self, docs):
-        tokvecs = self.model.get_ref("tok2vec")(docs)
-        vectors = self.model.get_ref("output_layer")(tokvecs)
-        return tokvecs, vectors
-
-    def get_loss(self, examples, vectors, prediction):
-        validate_examples(examples, "ClozeMultitask.get_loss")
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
-        target = vectors[ids]
-        gradient = self.distance.get_grad(prediction, target)
-        loss = self.distance.get_loss(prediction, target)
-        return float(loss), gradient
-
-    def update(self, examples, *, drop=0., sgd=None, losses=None):
-        pass
-
-    def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        if losses is not None and self.name not in losses:
-            losses[self.name] = 0.
-        set_dropout_rate(self.model, drop)
-        validate_examples(examples, "ClozeMultitask.rehearse")
-        predictions, bp_predictions = self.model.begin_update()
-        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
-        bp_predictions(d_predictions)
-        if sgd is not None:
-            self.finish_update(sgd)
-        if losses is not None:
-            losses[self.name] += loss
-        return losses
-
-    def add_label(self, label):
-        raise NotImplementedError

From eb61f3d6dd97474bd21c7bc6e57ea75db3b59111 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 12 Dec 2022 08:55:53 +0100
Subject: [PATCH 249/504] Custom extensions for spans with equal boundaries
 (#11429)

* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/doc/test_underscore.py | 119 +++++++++++++++++++++++++++++
 spacy/tokens/doc.pyx               |  30 ++++++--
 spacy/tokens/span.pyx              |  38 +++++++--
 spacy/tokens/underscore.py         |  44 ++++++++++-
 4 files changed, 214 insertions(+), 17 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index b79d2f01f41..ca5c2ad3959 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+
 
 @pytest.fixture(scope="function", autouse=True)
 def clean_underscore():
@@ -172,3 +176,118 @@ def test_method(doc, arg1=1, arg2=2):
     doc = Doc(en_vocab, words=["hello", "world"])
     assert test_method.__doc__ == "I am a docstring"
     assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
+
+
+def test_underscore_for_unique_span(en_tokenizer):
+    """Test that spans with the same boundaries but with different labels are uniquely identified (see #9706)."""
+    Doc.set_extension(name="doc_extension", default=None)
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    doc._.doc_extension = "doc extension"
+    doc[0]._.token_extension = "token extension"
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    # Assert extensions
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change label of span and assert extensions
+    span_1.label_ = "NEW_LABEL"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change KB_ID and assert extensions
+    span_1.kb_id_ = "KB_ID"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
+
+    # Change extensions and assert
+    span_2._.span_extension = "updated span_2 extension"
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Change span ID and assert extensions
+    span_2.id = 2
+    assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
+
+    # Assert extensions with original key
+    assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
+    assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
+
+
+def test_underscore_for_unique_span_from_docs(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
+    Span.set_extension(name="span_extension", default=None)
+    Token.set_extension(name="token_extension", default=None)
+
+    # Initialize doc
+    text_1 = "Hello, world!"
+    doc_1 = en_tokenizer(text_1)
+    span_1a = Span(doc_1, 0, 2, "SPAN_1a")
+    span_1b = Span(doc_1, 0, 2, "SPAN_1b")
+
+    text_2 = "This is a test."
+    doc_2 = en_tokenizer(text_2)
+    span_2a = Span(doc_2, 0, 3, "SPAN_2a")
+
+    # Set custom extensions
+    doc_1[0]._.token_extension = "token_1"
+    doc_2[1]._.token_extension = "token_2"
+    span_1a._.span_extension = "span_1a extension"
+    span_1b._.span_extension = "span_1b extension"
+    span_2a._.span_extension = "span_2a extension"
+
+    doc = Doc.from_docs([doc_1, doc_2])
+    # Assert extensions
+    assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
+
+    # Check extensions on merged doc
+    assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
+    assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
+    assert (
+        doc.user_data[
+            (
+                "._.",
+                "span_extension",
+                span_2a.start_char + len(doc_1.text) + 1,
+                span_2a.end_char + len(doc_1.text) + 1,
+                span_2a.label,
+                span_2a.kb_id,
+                span_2a.id,
+            )
+        ]
+        == "span_2a extension"
+    )
+
+
+def test_underscore_for_unique_span_as_span(en_tokenizer):
+    """Test that spans in the user_data keep the same data structure when using Span.as_doc"""
+    Span.set_extension(name="span_extension", default=None)
+
+    # Initialize doc
+    text = "Hello, world!"
+    doc = en_tokenizer(text)
+    span_1 = Span(doc, 0, 2, "SPAN_1")
+    span_2 = Span(doc, 0, 2, "SPAN_2")
+
+    # Set custom extensions
+    span_1._.span_extension = "span_1 extension"
+    span_2._.span_extension = "span_2 extension"
+
+    span_doc = span_1.as_doc(copy_user_data=True)
+
+    # Assert extensions
+    assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
+    assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 9a272d04781..97b3f800464 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1225,13 +1225,22 @@ cdef class Doc:
 
             if "user_data" not in exclude:
                 for key, value in doc.user_data.items():
-                    if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                        data_type, name, start, end = key
+                    if isinstance(key, tuple) and len(key) >= 4 and key[0] == "._.":
+                        data_type = key[0]
+                        name = key[1]
+                        start = key[2]
+                        end = key[3]
                         if start is not None or end is not None:
                             start += char_offset
                             if end is not None:
                                 end += char_offset
-                            concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+                                _label = key[4]
+                                _kb_id = key[5]
+                                _span_id = key[6]
+                                concat_user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
+                            else:
+                                concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+
                         else:
                             warnings.warn(Warnings.W101.format(name=name))
                     else:
@@ -1675,7 +1684,11 @@ cdef class Doc:
                 Span.set_extension(span_attr)
             for span_data in doc_json["underscore_span"][span_attr]:
                 value = span_data["value"]
-                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
+                span = self.char_span(span_data["start"], span_data["end"])
+                span.label = span_data["label"]
+                span.kb_id = span_data["kb_id"]
+                span.id = span_data["id"]
+                span._.set(span_attr, value)
         return self
 
     def to_json(self, underscore=None):
@@ -1753,13 +1766,16 @@ cdef class Doc:
                                 if attr not in data["underscore_token"]:
                                     data["underscore_token"][attr] = []
                                 data["underscore_token"][attr].append({"start": start, "value": value})
-                            # Span attribute
-                            elif start is not None and end is not None:
+                            # Else span attribute
+                            elif end is not None:
+                                _label = data_key[4]
+                                _kb_id = data_key[5]
+                                _span_id = data_key[6]
                                 if "underscore_span" not in data:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 26b56748d32..c4e4c3e5d39 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -230,11 +230,10 @@ cdef class Span:
         cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char)
+                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
-
         copy_user_data (bool): Whether or not to copy the original doc's user data.
         array_head (tuple): `Doc` array attrs, can be passed in to speed up computation.
         array (ndarray): `Doc` as array, can be passed in to speed up computation.
@@ -287,12 +286,22 @@ cdef class Span:
             char_offset = self.start_char
             for key, value in self.doc.user_data.items():
                 if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
-                    data_type, name, start, end = key
+                    data_type = key[0]
+                    name = key[1]
+                    start = key[2]
+                    end = key[3]
                     if start is not None or end is not None:
                         start -= char_offset
+                        # Check if Span object
                         if end is not None:
                             end -= char_offset
-                        user_data[(data_type, name, start, end)] = copy.copy(value)
+                            _label = key[4]
+                            _kb_id = key[5]
+                            _span_id = key[6]
+                            user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value)
+                        # Else Token object
+                        else:
+                            user_data[(data_type, name, start, end)] = copy.copy(value)
                 else:
                     user_data[key] = copy.copy(value)
             doc.user_data = user_data
@@ -815,21 +824,36 @@ cdef class Span:
             return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.span_c().label = label
+            if label != self.span_c().label :
+                old_label = self.span_c().label
+                self.span_c().label = label
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.span_c().kb_id = kb_id
+            if kb_id != self.span_c().kb_id :
+                old_kb_id = self.span_c().kb_id
+                self.span_c().kb_id = kb_id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property id:
         def __get__(self):
             return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.span_c().id = id
+            if id != self.span_c().id :
+                old_id = self.span_c().id
+                self.span_c().id = id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
+                Underscore._replace_keys(old, new)
 
     property ent_id:
         """Alias for the span's ID."""
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 0aa0c1e6d40..63706851286 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
+from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
-    from .span import Span
     from .token import Token
 
 
@@ -26,6 +26,9 @@ def __init__(
         obj: Union["Doc", "Span", "Token"],
         start: Optional[int] = None,
         end: Optional[int] = None,
+        label: int = 0,
+        kb_id: int = 0,
+        span_id: int = 0,
     ):
         object.__setattr__(self, "_extensions", extensions)
         object.__setattr__(self, "_obj", obj)
@@ -37,6 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
+        if type(obj) == Span:
+            object.__setattr__(self, "_label", label)
+            object.__setattr__(self, "_kb_id", kb_id)
+            object.__setattr__(self, "_span_id", span_id)
 
     def __dir__(self) -> List[str]:
         # Hack to enable autocomplete on custom extensions
@@ -89,8 +96,39 @@ def get(self, name: str) -> Any:
     def has(self, name: str) -> bool:
         return name in self._extensions
 
-    def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
-        return ("._.", name, self._start, self._end)
+    def _get_key(
+        self, name: str
+    ) -> Union[
+        Tuple[str, str, Optional[int], Optional[int]],
+        Tuple[str, str, Optional[int], Optional[int], int, int, int],
+    ]:
+        if hasattr(self, "_label"):
+            return (
+                "._.",
+                name,
+                self._start,
+                self._end,
+                self._label,
+                self._kb_id,
+                self._span_id,
+            )
+        else:
+            return "._.", name, self._start, self._end
+
+    @staticmethod
+    def _replace_keys(old_underscore: "Underscore", new_underscore: "Underscore"):
+        """
+        This function is called by Span when its kb_id or label are re-assigned.
+        It checks if any user_data is stored for this span and replaces the keys
+        """
+        for name in old_underscore._extensions:
+            old_key = old_underscore._get_key(name)
+            old_doc = old_underscore._doc
+            new_key = new_underscore._get_key(name)
+            if old_key != new_key and old_key in old_doc.user_data:
+                old_underscore._doc.user_data[
+                    new_key
+                ] = old_underscore._doc.user_data.pop(old_key)
 
     @classmethod
     def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:

From 6cf72040b850ba01dd2b22e728c594aa86520cc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Sat, 17 Dec 2022 14:32:19 +0100
Subject: [PATCH 250/504] Fix v4 branch to build against Thinc v9 (#11921)

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 pyproject.toml                                |   5 +-
 requirements.txt                              |   2 +-
 setup.cfg                                     |   4 +-
 setup.py                                      |   2 +
 spacy/ml/parser_model.pyx                     |  26 +-
 .../_parser_internals/_beam_utils.pxd         |   3 +-
 .../_parser_internals/_beam_utils.pyx         |  12 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +-
 spacy/pipeline/_parser_internals/ner.pyx      |   4 +-
 spacy/pipeline/_parser_internals/search.pxd   |  89 +++++
 spacy/pipeline/_parser_internals/search.pyx   | 306 ++++++++++++++++++
 spacy/pipeline/edit_tree_lemmatizer.py        |   7 +-
 spacy/pipeline/morphologizer.pyx              |   5 +-
 spacy/pipeline/senter.pyx                     |   6 +-
 spacy/pipeline/tagger.pyx                     |   7 +-
 spacy/pipeline/transition_parser.pyx          |  21 +-
 spacy/tests/conftest.py                       |  32 ++
 spacy/tests/parser/_search.pyx                | 119 +++++++
 spacy/tests/parser/test_search.py             |   3 +
 19 files changed, 606 insertions(+), 50 deletions(-)
 create mode 100644 spacy/pipeline/_parser_internals/search.pxd
 create mode 100644 spacy/pipeline/_parser_internals/search.pyx
 create mode 100644 spacy/tests/parser/_search.pyx
 create mode 100644 spacy/tests/parser/test_search.py

diff --git a/pyproject.toml b/pyproject.toml
index bfd7e68d1f7..77e1471426f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,8 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
-    "numpy>=1.15.0; python_version < '3.9'",
-    "numpy>=1.25.0; python_version >= '3.9'",
+    "thinc>=9.0.0.dev0,<9.1.0",
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 54b8f22a17e..61c19d68014 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index a5df23cc4ea..9a0de6cfee1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
-    wasabi>=0.9.1,<1.2.0
+    thinc>=9.0.0.dev0,<9.1.0
+    wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
diff --git a/setup.py b/setup.py
index a80016ea9ea..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@
     "spacy.pipeline._parser_internals.arc_eager",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals.search",
     "spacy.pipeline._parser_internals._state",
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
@@ -66,6 +67,7 @@
     "spacy.matcher.dependencymatcher",
     "spacy.symbols",
     "spacy.vectors",
+    "spacy.tests.parser._search",
 ]
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index cb323e98891..10a9f0bc485 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -3,7 +3,6 @@
 cimport numpy as np
 from libc.math cimport exp
 from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 from thinc.backends.linalg cimport Vec, VecVec
 
@@ -116,14 +115,10 @@ cdef void predict_states(
         n.hiddens * n.pieces
     )
     for i in range(n.states):
-        VecVec.add_i(
-            &A.unmaxed[i*n.hiddens*n.pieces],
-            W.feat_bias, 1.,
-            n.hiddens * n.pieces
-        )
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
-            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
+            which = _arg_max(&A.unmaxed[index], n.pieces)
             A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
     memset(A.scores, 0, n.states * n.classes * sizeof(float))
     if W.hidden_weights == NULL:
@@ -138,7 +133,7 @@ cdef void predict_states(
         )
         # Add bias
         for i in range(n.states):
-            VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
     # Set unseen classes to minimum value
     i = 0
     min_ = A.scores[0]
@@ -187,7 +182,8 @@ cdef void cpu_log_loss(
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = Vec.arg_max(scores, O)
+    guess = _arg_max(scores, O)
+
     if best == -1 or guess == -1:
         # These shouldn't happen, but if they do, we want to make sure we don't
         # cause an OOB access.
@@ -529,3 +525,15 @@ cdef class precompute_hiddens:
             return d_best.reshape((d_best.shape + (1,)))
 
         return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 596306b2319..571f246b1e3 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,7 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
-
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
 cdef int check_final_state(void* _state, void* extra_args) except -1
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index ac04be5a719..d004d313c3e 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,21 +1,17 @@
 # cython: infer_types=True
 import numpy
-
-from thinc.extra.search cimport Beam
-
-from thinc.extra.search import MaxViolation
-
-from thinc.extra.search cimport MaxViolation
+from cpython.ref cimport PyObject, Py_XDECREF
 
 from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
-
+from .search cimport Beam, MaxViolation
+from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
 
 
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
     dest = <StateC*>_dest
     src = <StateC*>_src
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index e1375494482..10f2649baa0 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -18,8 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
-
-from thinc.extra.search cimport Beam
+from .search cimport Beam
 
 
 cdef weight_t MIN_SCORE = -90000
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index c77b7b50f2d..6851f9f2096 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -8,8 +8,6 @@ from libc.stdint cimport int32_t
 
 from collections import Counter
 
-from thinc.extra.search cimport Beam
-
 from ...tokens.doc cimport Doc
 
 from ...tokens.span import Span
@@ -23,6 +21,8 @@ from ...typedefs cimport attr_t, weight_t
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
+from .search cimport Beam
+from .stateclass cimport StateClass
 from ._state cimport StateC
 from .stateclass cimport StateClass
 from .transition_system cimport Transition, do_func_t
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
new file mode 100644
index 00000000000..dfe30e1c130
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -0,0 +1,89 @@
+from cymem.cymem cimport Pool
+
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
+from libcpp.pair cimport pair
+from libcpp.queue cimport priority_queue
+from libcpp.vector cimport vector
+
+from ...typedefs cimport class_t, weight_t, hash_t
+
+ctypedef pair[weight_t, size_t] Entry
+ctypedef priority_queue[Entry] Queue
+
+
+ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
+
+ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
+
+ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
+
+ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
+
+ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
+
+
+cdef struct _State:
+    void* content
+    class_t* hist
+    weight_t score
+    weight_t loss
+    int i
+    int t
+    bint is_done
+
+
+cdef class Beam:
+    cdef Pool mem
+    cdef class_t nr_class
+    cdef class_t width
+    cdef class_t size
+    cdef public weight_t min_density
+    cdef int t
+    cdef readonly bint is_done
+    cdef list histories
+    cdef list _parent_histories
+    cdef weight_t** scores
+    cdef int** is_valid
+    cdef weight_t** costs
+    cdef _State* _parents
+    cdef _State* _states
+    cdef del_func_t del_func
+
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
+
+    cdef inline void* at(self, int i) nogil:
+        return self._states[i].content
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
+ 
+
+    cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
+        self.scores[i][j] = score
+        self.is_valid[i][j] = is_valid
+        self.costs[i][j] = cost
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
+
+
+cdef class MaxViolation:
+    cdef Pool mem
+    cdef weight_t cost
+    cdef weight_t delta
+    cdef readonly weight_t p_score
+    cdef readonly weight_t g_score
+    cdef readonly double Z
+    cdef readonly double gZ
+    cdef class_t n
+    cdef readonly list p_hist
+    cdef readonly list g_hist
+    cdef readonly list p_probs
+    cdef readonly list g_probs
+
+    cpdef int check(self, Beam pred, Beam gold) except -1
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
new file mode 100644
index 00000000000..1d9b6dd7adf
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -0,0 +1,306 @@
+# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+cimport cython
+from libc.string cimport memset, memcpy
+from libc.math cimport log, exp
+import math
+
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
+
+cdef class Beam:
+    def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
+        assert nr_class != 0
+        assert width != 0
+        self.nr_class = nr_class
+        self.width = width
+        self.min_density = min_density
+        self.size = 1
+        self.t = 0
+        self.mem = Pool()
+        self.del_func = NULL
+        self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        cdef int i
+        self.histories = [[] for i in range(self.width)]
+        self._parent_histories = [[] for i in range(self.width)]
+
+        self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        for i in range(self.width):
+            self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+            self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
+            self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+
+    def __len__(self):
+        return self.size
+
+    property score:
+        def __get__(self):
+            return self._states[0].score
+
+    property min_score:
+        def __get__(self):
+            return self._states[self.size-1].score
+
+    property loss:
+        def __get__(self):
+            return self._states[0].loss
+
+    property probs:
+        def __get__(self):
+            return _softmax([self._states[i].score for i in range(self.size)])
+
+    property scores:
+        def __get__(self):
+            return [self._states[i].score for i in range(self.size)]
+
+    property histories:
+        def __get__(self):
+            return self.histories
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1:
+        cdef int j
+        for j in range(self.nr_class):
+            self.scores[i][j] = scores[j]
+            self.is_valid[i][j] = is_valid[j]
+            self.costs[i][j] = costs[j]
+
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
+        cdef int i, j
+        for i in range(self.width):
+            memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
+            memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
+            memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
+        for i in range(self.width):
+            self._states[i].content = init_func(self.mem, n, extra_args)
+            self._parents[i].content = init_func(self.mem, n, extra_args)
+        self.del_func = del_func
+
+    def __dealloc__(self):
+        if self.del_func == NULL:
+            return
+
+        for i in range(self.width):
+            self.del_func(self.mem, self._states[i].content, NULL)
+            self.del_func(self.mem, self._parents[i].content, NULL)
+
+    @cython.cdivision(True)
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1:
+        cdef weight_t** scores = self.scores
+        cdef int** is_valid = self.is_valid
+        cdef weight_t** costs = self.costs
+
+        cdef Queue* q = new Queue()
+        self._fill(q, scores, is_valid)
+        # For a beam of width k, we only ever need 2k state objects. How?
+        # Each transition takes a parent and a class and produces a new state.
+        # So, we don't need the whole history --- just the parent. So at
+        # each step, we take a parent, and apply one or more extensions to
+        # it.
+        self._parents, self._states = self._states, self._parents
+        self._parent_histories, self.histories = self.histories, self._parent_histories
+        cdef weight_t score
+        cdef int p_i
+        cdef int i = 0
+        cdef class_t clas
+        cdef _State* parent
+        cdef _State* state
+        cdef hash_t key
+        cdef PreshMap seen_states = PreshMap(self.width)
+        cdef uint64_t is_seen
+        cdef uint64_t one = 1
+        while i < self.width and not q.empty():
+            data = q.top()
+            p_i = data.second / self.nr_class
+            clas = data.second % self.nr_class
+            score = data.first
+            q.pop()
+            parent = &self._parents[p_i]
+            # Indicates terminal state reached; i.e. state is done
+            if parent.is_done:
+                # Now parent will not be changed, so we don't have to copy.
+                # Once finished, should also be unbranching.
+                self._states[i], parent[0] = parent[0], self._states[i]
+                parent.i = self._states[i].i
+                parent.t = self._states[i].t
+                parent.is_done = self._states[i].t
+                self._states[i].score = score
+                self.histories[i] = list(self._parent_histories[p_i])
+                i += 1
+            else:
+                state = &self._states[i]
+                # The supplied transition function should adjust the destination
+                # state to be the result of applying the class to the source state
+                transition_func(state.content, parent.content, clas, extra_args)
+                key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
+                is_seen = <uint64_t>seen_states.get(key)
+                if key == 0 or key == 1 or not is_seen:
+                    if key != 0 and key != 1:
+                        seen_states.set(key, <void*>one)
+                    state.score = score
+                    state.loss = parent.loss + costs[p_i][clas]
+                    self.histories[i] = list(self._parent_histories[p_i])
+                    self.histories[i].append(clas)
+                    i += 1
+        del q
+        self.size = i
+        assert self.size >= 1
+        for i in range(self.width):
+            memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
+        self.t += 1
+
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
+        cdef int i
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self._states[i].is_done = finish_func(self._states[i].content, extra_args)
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self.is_done = False
+                break
+        else:
+            self.is_done = True
+
+    @cython.cdivision(True)
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
+        """Populate the queue from a k * n matrix of scores, where k is the
+        beam-width, and n is the number of classes.
+        """
+        cdef Entry entry
+        cdef weight_t score
+        cdef _State* s
+        cdef int i, j, move_id
+        assert self.size >= 1
+        cdef vector[Entry] entries
+        for i in range(self.size):
+            s = &self._states[i]
+            move_id = i * self.nr_class
+            if s.is_done:
+                # Update score by path average, following TACL '13 paper.
+                if self.histories[i]:
+                    entry.first = s.score + (s.score / self.t)
+                else:
+                    entry.first = s.score
+                entry.second = move_id
+                entries.push_back(entry)
+            else:
+                for j in range(self.nr_class):
+                    if is_valid[i][j]:
+                        entry.first = s.score + scores[i][j]
+                        entry.second = move_id + j
+                        entries.push_back(entry)
+        cdef double max_, Z, cutoff
+        if self.min_density == 0.0:
+            for i in range(entries.size()):
+                q.push(entries[i])
+        elif not entries.empty():
+            max_ = entries[0].first
+            Z = 0.
+            cutoff = 0.
+            # Softmax into probabilities, so we can prune
+            for i in range(entries.size()):
+                if entries[i].first > max_:
+                    max_ = entries[i].first
+            for i in range(entries.size()):
+                Z += exp(entries[i].first-max_)
+            cutoff = (1. / Z) * self.min_density
+            for i in range(entries.size()):
+                prob = exp(entries[i].first-max_) / Z
+                if prob >= cutoff:
+                    q.push(entries[i])
+
+
+cdef class MaxViolation:
+    def __init__(self):
+        self.p_score = 0.0
+        self.g_score = 0.0
+        self.Z = 0.0
+        self.gZ = 0.0
+        self.delta = -1
+        self.cost = 0
+        self.p_hist = []
+        self.g_hist = []
+        self.p_probs = []
+        self.g_probs = []
+
+    cpdef int check(self, Beam pred, Beam gold) except -1:
+        cdef _State* p = &pred._states[0]
+        cdef _State* g = &gold._states[0]
+        cdef weight_t d = p.score - g.score
+        if p.loss >= 1 and (self.cost == 0 or d > self.delta):
+            self.cost = p.loss
+            self.delta = d
+            self.p_hist = list(pred.histories[0])
+            self.g_hist = list(gold.histories[0])
+            self.p_score = p.score
+            self.g_score = g.score
+            self.Z = 1e-10
+            self.gZ = 1e-10
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    self.Z += exp(pred._states[i].score)
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    prob = exp(gold._states[i].score)
+                    self.Z += prob
+                    self.gZ += prob
+
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1:
+        d = pred.score - gold.score
+        seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
+        if pred.loss > 0 and (self.cost == 0 or d > self.delta):
+            p_hist = []
+            p_scores = []
+            g_hist = []
+            g_scores = []
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    p_scores.append(pred._states[i].score)
+                    p_hist.append(list(pred.histories[i]))
+                # This can happen from non-monotonic actions
+                # If we find a better gold analysis this way, be sure to keep it.
+                elif pred._states[i].loss <= 0 \
+                and tuple(pred.histories[i]) not in seen_golds:
+                    g_scores.append(pred._states[i].score)
+                    g_hist.append(list(pred.histories[i]))
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    g_scores.append(gold._states[i].score)
+                    g_hist.append(list(gold.histories[i]))
+
+            all_probs = _softmax(p_scores + g_scores)
+            p_probs = all_probs[:len(p_scores)]
+            g_probs_all = all_probs[len(p_scores):]
+            g_probs = _softmax(g_scores)
+
+            self.cost = pred.loss
+            self.delta = d
+            self.p_hist = p_hist
+            self.g_hist = g_hist
+            # TODO: These variables are misnamed! These are the gradients of the loss.
+            self.p_probs = p_probs
+            # Intuition here:
+            # The gradient of the loss is:
+            # P(model) - P(truth)
+            # Normally, P(truth) is 1 for the gold
+            # But, if we want to do the "partial credit" scheme, we want
+            # to create a distribution over the gold, proportional to the scores
+            # awarded.
+            self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
+
+
+def _softmax(nums):
+    if not nums:
+        return []
+    max_ = max(nums)
+    nums = [(exp(n-max_) if n is not None else None) for n in nums]
+    Z = sum(n for n in nums if n is not None)
+    return [(n/Z if n is not None else None) for n in nums]
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 2ef639cad52..f9a8ae10561 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,9 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model
 from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 
 from .. import util
 from ..errors import Errors
@@ -131,7 +132,9 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
+        loss_func = LegacySequenceCategoricalCrossentropy(
+            normalize=False, missing_value=-1
+        )
 
         truths = []
         for eg in examples:
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index cc8f87936b9..d3068bdffdd 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import srsly
-from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Optional, Union
@@ -302,7 +303,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 521afe1d181..185430c122c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -4,7 +4,9 @@ from itertools import islice
 from typing import Callable, Optional
 
 import srsly
-from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
@@ -163,7 +165,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 8ecd0c46ee0..f25ee00407b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -2,7 +2,8 @@
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import numpy
 import srsly
-from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, set_dropout_rate, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
@@ -242,7 +243,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = SequenceCategoricalCrossentropy()
+        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -273,7 +274,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
+        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index b8ebbf8ca88..d310df92151 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,23 +13,20 @@ from libcpp.vector cimport vector
 
 import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+import numpy.random
 import numpy
 import numpy.random
 import srsly
 from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ..ml.parser_model cimport (
-    ActivationsC,
-    SizesC,
-    WeightsC,
-    alloc_activations,
-    arg_max_if_valid,
-    cpu_log_loss,
-    free_activations,
-    get_c_sizes,
-    get_c_weights,
-    predict_states,
-)
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
 from ._parser_internals.stateclass cimport StateClass
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 2a9f441c9b0..6085b89cf02 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,5 +1,10 @@
 import pytest
+from spacy.util import get_lang_class
+import functools
 from hypothesis import settings
+import inspect
+import importlib
+import sys
 
 from spacy.util import get_lang_class
 
@@ -48,6 +53,33 @@ def getopt(opt):
             pytest.skip("not referencing any issues")
 
 
+# Decorator for Cython-built tests
+# https://shwina.github.io/cython-testing/
+def cytest(func):
+    """
+    Wraps `func` in a plain Python function.
+    """
+
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        bound = inspect.signature(func).bind(*args, **kwargs)
+        return func(*bound.args, **bound.kwargs)
+
+    return wrapped
+
+
+def register_cython_tests(cython_mod_name: str, test_mod_name: str):
+    """
+    Registers all callables with name `test_*` in Cython module `cython_mod_name`
+    as attributes in module `test_mod_name`, making them discoverable by pytest.
+    """
+    cython_mod = importlib.import_module(cython_mod_name)
+    for name in dir(cython_mod):
+        item = getattr(cython_mod, name)
+        if callable(item) and name.startswith("test_"):
+            setattr(sys.modules[test_mod_name], name, item)
+
+
 # Fixtures for language tokenizers (languages sorted alphabetically)
 
 
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
new file mode 100644
index 00000000000..23fc8164412
--- /dev/null
+++ b/spacy/tests/parser/_search.pyx
@@ -0,0 +1,119 @@
+# cython: infer_types=True, binding=True
+from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
+from spacy.typedefs cimport class_t, weight_t
+from cymem.cymem cimport Pool
+
+from ..conftest import cytest
+import pytest
+
+cdef struct TestState:
+    int length
+    int x
+    Py_UNICODE* string
+
+
+cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
+    dest_state = <TestState*>dest
+    src_state = <TestState*>src
+    dest_state.length = src_state.length
+    dest_state.x = src_state.x
+    dest_state.x += clas
+    if extra_args != NULL:
+        dest_state.string = <Py_UNICODE*>extra_args
+    else:
+        dest_state.string = src_state.string
+
+
+cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
+    state = <TestState*>mem.alloc(1, sizeof(TestState))
+    state.length = n
+    state.x = 1
+    if extra_args == NULL:
+        state.string = u'default'
+    else:
+        state.string = <Py_UNICODE*>extra_args
+    return state
+
+
+cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
+    state = <TestState*>state
+    mem.free(state)
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width",
+    [
+        (2, 3),
+        (3, 6),
+        (4, 20),
+    ]
+)
+def test_init(nr_class, beam_width):
+    b = Beam(nr_class, beam_width)
+    assert b.size == 1
+    assert b.width == beam_width
+    assert b.nr_class == nr_class
+
+@cytest
+def test_init_violn():
+    MaxViolation()
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (2, 3, 3),
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_initialize(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length, s.length
+        assert s.string == 'default'
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length,extra",
+    [
+        (2, 3, 4, None),
+        (3, 6, 15, u"test beam 1"),
+    ]
+)
+def test_initialize_extra(nr_class, beam_width, length, extra):
+    b = Beam(nr_class, beam_width)
+    if extra is None:
+        b.initialize(initialize, destroy, length, NULL)
+    else:
+        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+    for i in range(b.width):
+        s = <TestState*>b.at(i)
+        assert s.length == length
+
+
+@cytest
+@pytest.mark.parametrize("nr_class,beam_width,length",
+    [
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
+def test_transition(nr_class, beam_width, length):
+    b = Beam(nr_class, beam_width)
+    b.initialize(initialize, destroy, length, NULL)
+    b.set_cell(0, 2, 30, True, 0)
+    b.set_cell(0, 1, 42, False, 0)
+    b.advance(transition, NULL, NULL)
+    assert b.size == 1, b.size
+    assert b.score == 30, b.score
+    s = <TestState*>b.at(0)
+    assert s.x == 3
+    assert b._states[0].score == 30, b._states[0].score
+    b.set_cell(0, 1, 10, True, 0)
+    b.set_cell(0, 2, 20, True, 0)
+    b.advance(transition, NULL, NULL)
+    assert b._states[0].score == 50, b._states[0].score
+    assert b._states[1].score == 40
+    s = <TestState*>b.at(0)
+    assert s.x == 5
diff --git a/spacy/tests/parser/test_search.py b/spacy/tests/parser/test_search.py
new file mode 100644
index 00000000000..136c3a11b8a
--- /dev/null
+++ b/spacy/tests/parser/test_search.py
@@ -0,0 +1,3 @@
+from ..conftest import register_cython_tests
+
+register_cython_tests("spacy.tests.parser._search", __name__)

From 549d40d4e2698ae95c37b39089ad9b0598ed85b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Dec 2022 10:23:31 +0100
Subject: [PATCH 251/504] Fix fallout from a previous merge

---
 spacy/pipeline/textcat_multilabel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index ac024ba3639..9ed9770086c 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -163,6 +163,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.
         save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init

From 957221af4a881c35c2326abdbd940c4521e29cb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 29 Dec 2022 08:03:24 +0100
Subject: [PATCH 252/504] Adjust to new `Schedule` class and pass scores to
 `Optimizer` (#12008)

* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
---
 pyproject.toml             |  2 +-
 requirements.txt           |  2 +-
 setup.cfg                  |  4 ++--
 spacy/training/batchers.py | 38 ++++++++++++++++----------------------
 spacy/training/loop.py     |  3 ++-
 spacy/util.py              | 13 +++++++++----
 6 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 77e1471426f..58e4382e829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev0,<9.1.0",
+    "thinc>=9.0.0.dev1,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 61c19d68014..0492e3c3631 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev0,<9.1.0
+thinc>=9.0.0.dev1,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 9a0de6cfee1..c555218c06a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev0,<9.1.0
-    wasabi>=0.9.1,<1.1.0
+    thinc>=9.0.0.dev1,<9.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 050c3351b08..519e61315da 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,20 +1,9 @@
 import itertools
-from functools import partial
-from typing import (
-    Any,
-    Callable,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Sequence,
-    TypeVar,
-    Union,
-)
+from thinc.schedules import Schedule, constant as constant_schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int]
+Sizing = Union[Sequence[int], int, Schedule[int]]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -119,12 +108,13 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)  # type: Iterator[int]
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    for outer_batch in minibatch(seqs, size=buffer):
+        assert isinstance(size, Schedule)
+        size_ = size
+    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
         outer_batch = list(outer_batch)
-        target_size = next(size_)
+        target_size = size_(step)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -155,10 +145,12 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)  # type: Iterator[int]
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    target_size = next(size_)
+        assert isinstance(size, Schedule)
+        size_ = size
+    step = 0
+    target_size = size_(step)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -183,7 +175,8 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            target_size = next(size_)
+            step += 1
+            target_size = size_(step)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -201,7 +194,8 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                target_size = next(size_)
+                step += 1
+                target_size = size_(step)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 56df5395720..05c59fc9877 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -215,7 +215,7 @@ def train_while_improving(
         if before_update:
             before_update_args = {"step": step, "epoch": epoch}
             before_update(nlp, before_update_args)
-        dropout = next(dropouts)  # type: ignore
+        dropout = dropouts(optimizer.step)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
                 subbatch,
@@ -241,6 +241,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
+            optimizer.last_score = score
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
diff --git a/spacy/util.py b/spacy/util.py
index 463ac219bf5..551f78cc969 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,7 +1,12 @@
 import functools
 import importlib
 import importlib.util
-import inspect
+import re
+from pathlib import Path
+import thinc
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+import functools
 import itertools
 import logging
 import os
@@ -1637,12 +1642,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
         size_ = size
     items = iter(items)
-    while True:
-        batch_size = next(size_)
+    for step in itertools.count():
+        batch_size = size_(step)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From 378301633e6dea1ce120ed0e8b7bd780c377c4be Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso <tetsuok@users.noreply.github.com>
Date: Wed, 4 Jan 2023 01:43:09 +0900
Subject: [PATCH 253/504] Delete unused imports for StringStore (#12040)

---
 spacy/lexeme.pxd    | 18 ++++--------------
 spacy/tokenizer.pxd |  4 ++++
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index ff2e4f92edf..2d14edcd6b0 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,20 +1,10 @@
 from numpy cimport ndarray
 
-from .attrs cimport (
-    ID,
-    LANG,
-    LENGTH,
-    LOWER,
-    NORM,
-    ORTH,
-    PREFIX,
-    SHAPE,
-    SUFFIX,
-    attr_id_t,
-)
-from .strings cimport StringStore
+from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
+from .attrs cimport attr_id_t
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
+
 from .structs cimport LexemeC
-from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index f64e0e93413..c963dcbcfa4 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,6 +2,10 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
+from .typedefs cimport hash_t
+from .structs cimport LexemeC, SpanC, TokenC
+from .tokens.doc cimport Doc
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .strings cimport StringStore
 from .structs cimport LexemeC, SpanC, TokenC

From f88f391abcb93c061577781965019d44dcd40819 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Mon, 9 Jan 2023 20:15:02 +0100
Subject: [PATCH 254/504] Pass `step=0` to `Schedule` class to yield initial
 learning rate (#12078)

---
 spacy/training/loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 05c59fc9877..58d5b06786f 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -111,7 +111,7 @@ def save_checkpoint(is_best):
         stdout.write(
             msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
         )
-    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
     with nlp.select_pipes(disable=frozen_components):
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:

From b6f1efe8f6dc2979f0a371546da47b0dfeae4336 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 18:57:50 +0100
Subject: [PATCH 255/504] update tests from master to follow v4 principles

---
 spacy/tests/pipeline/test_entity_ruler.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 9f5204006ec..ae57da5134c 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,9 +357,9 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy_pipe(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")
@@ -367,9 +367,9 @@ def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
     ruler.add_patterns(patterns)
     doc = nlp("helloo")
@@ -377,15 +377,14 @@ def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
+@pytest.mark.parametrize()
+def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():
         return lambda x, y, z: False
 
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
-        name="entity_ruler",
+        "entity_ruler",
         config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
     )
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]

From 571afa07315b3bd4da9fd70c92d1aa78d91d40c1 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 19:04:06 +0100
Subject: [PATCH 256/504] update tests from master to follow v4 principles (2)

---
 spacy/tests/pipeline/test_entity_ruler.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index ae57da5134c..6bff3288dc3 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,7 +357,6 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_pipe(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -367,7 +366,6 @@ def test_entity_ruler_fuzzy_pipe(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -377,7 +375,6 @@ def test_entity_ruler_fuzzy(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():

From 12eba04fecf9da7ee0593077ac3f2af2fb4b1c7d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 13 Jan 2023 11:14:58 +0100
Subject: [PATCH 257/504] fix anchors (#12095)

---
 website/docs/api/stringstore.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index d4d85e6d56a..269ac2d0c4b 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -97,7 +97,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | ------------------------------ |
 | **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.items {#iter tag="method" new="4"}
+## StringStore.items {id="items", tag="method", version="4"}
 
 Iterate over the stored string-hash pairs in insertion order.
 
@@ -113,7 +113,7 @@ Iterate over the stored string-hash pairs in insertion order.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
 
-## StringStore.keys {#iter tag="method" new="4"}
+## StringStore.keys {id="keys", tag="method", version="4"}
 
 Iterate over the stored strings in insertion order.
 
@@ -129,7 +129,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | -------------------------------- |
 | **RETURNS** | A list of strings. ~~List[str]~~ |
 
-## StringStore.values {#iter tag="method" new="4"}
+## StringStore.values {id="values", tag="method", version="4"}
 
 Iterate over the stored string hashes in insertion order.
 

From a8fdbfe23ae5e88348378cdc81541a86cd54282c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 16 Jan 2023 10:25:53 +0100
Subject: [PATCH 258/504] Add
 `TrainablePipe.{distill,get_teacher_student_loss}` (#12016)

* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                               |   3 +
 spacy/ml/callbacks.py                         |   1 +
 spacy/pipeline/edit_tree_lemmatizer.py        |  19 +++
 spacy/pipeline/pipe.pyx                       |   4 +
 spacy/pipeline/tagger.pyx                     |  26 ++-
 spacy/pipeline/trainable_pipe.pyx             |  72 +++++++-
 spacy/pipeline/transition_parser.pyx          | 160 +++++++++++++++++-
 spacy/tests/parser/test_ner.py                |  46 +++++
 spacy/tests/parser/test_parse.py              |  49 ++++++
 .../pipeline/test_edit_tree_lemmatizer.py     |  47 +++++
 spacy/tests/pipeline/test_morphologizer.py    |   6 +
 spacy/tests/pipeline/test_senter.py           |   6 +
 spacy/tests/pipeline/test_tagger.py           |  46 +++++
 spacy/tests/pipeline/test_textcat.py          |   6 +
 spacy/tests/training/test_training.py         |  27 +--
 spacy/training/__init__.py                    |   3 +
 spacy/training/example.pyx                    |   7 +
 website/docs/api/dependencyparser.mdx         |  54 ++++++
 website/docs/api/edittreelemmatizer.mdx       |  54 ++++++
 website/docs/api/entityrecognizer.mdx         |  54 ++++++
 website/docs/api/morphologizer.mdx            |  54 ++++++
 website/docs/api/pipe.mdx                     |  61 +++++++
 website/docs/api/sentencerecognizer.mdx       |  54 ++++++
 website/docs/api/tagger.mdx                   |  54 ++++++
 website/docs/api/top-level.mdx                |   3 +-
 website/docs/usage/processing-pipelines.mdx   |  14 +-
 26 files changed, 906 insertions(+), 24 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 11b8980fd9d..9bdb66006e5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -975,6 +975,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4000 = ("Expected a Doc as input, but got: '{type}'")
     E4001 = ("Expected input to be one of the following types: ({expected_types}), "
              "but got '{received_type}'")
+    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
+    E4003 = ("Training examples for distillation must have the exact same tokens in the "
+             "reference and predicted docs.")
 
 
 # fmt: on
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
index e2378a7baf3..0783a5568a9 100644
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -23,6 +23,7 @@
     "update",
     "rehearse",
     "get_loss",
+    "get_teacher_student_loss",
     "initialize",
     "begin_update",
     "finish_update",
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index f9a8ae10561..d5169178b8c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -157,6 +157,25 @@ def get_loss(
 
         return float(loss), d_scores
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index ea5fc5253d9..af7cd09f171 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -89,6 +89,10 @@ cdef class Pipe:
             return self.scorer(examples, **scorer_kwargs)
         return {}
 
+    @property
+    def is_distillable(self) -> bool:
+        return False
+
     @property
     def is_trainable(self) -> bool:
         return False
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f25ee00407b..a8a89332bd4 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Tuple
 import numpy
 import srsly
 from thinc.api import Model, set_dropout_rate, Config
@@ -243,7 +244,6 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -257,12 +257,32 @@ class Tagger(TrainablePipe):
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update(docs)
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
-        grads, loss = loss_func(tag_scores, tutor_tag_scores)
+        loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
         bp_tag_scores(grads)
-        self.finish_update(sgd)
+        if sgd is not None:
+            self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores.
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index bd360c9501b..3ec3e7551aa 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -7,7 +7,7 @@ import warnings
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples
+from ..training import validate_examples, validate_distillation_examples
 from ..errors import Errors, Warnings
 from .pipe import Pipe, deserialize_config
 from .. import util
@@ -59,7 +59,54 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
+
+    def distill(self,
+               teacher_pipe: Optional["TrainablePipe"],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is typically trained on the probability
+        distribution of the teacher, but details may differ per pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+        
+        DOCS: https://spacy.io/api/pipe#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        validate_distillation_examples(examples, "TrainablePipe.distill")
+        set_dropout_rate(self.model, drop)
+        for node in teacher_pipe.model.walk():
+            if node.name == "softmax":
+                node.attrs["softmax_normalize"] = True
+        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        bp_student_scores(d_scores)
+        if sgd is not None:
+            self.finish_update(sgd)
+        losses[self.name] += loss
+        return losses
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -172,6 +219,19 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
 
+    def get_teacher_student_loss(self, teacher_scores, student_scores):
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
+
     def create_optimizer(self) -> Optimizer:
         """Create an optimizer for the pipeline component.
 
@@ -208,6 +268,14 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
+    @property
+    def is_distillable(self) -> bool:
+        # Normally a pipe overrides `get_teacher_student_loss` to implement
+        # distillation. In more exceptional cases, a pipe can provide its
+        # own `distill` implementation. If neither of these methods is
+        # overridden, the pipe does not implement distillation.
+        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
+
     @property
     def is_trainable(self) -> bool:
         return True
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d310df92151..feab7e7404b 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
+from typing import Dict, Iterable, List, Optional, Tuple
+from cymem.cymem cimport Pool
 cimport numpy as np
 from cymem.cymem cimport Pool
 
@@ -14,7 +15,10 @@ from libcpp.vector cimport vector
 import random
 
 import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
 import numpy.random
 import numpy
 import numpy.random
@@ -34,6 +38,9 @@ from .trainable_pipe import TrainablePipe
 
 from ._parser_internals cimport _beam_utils
 
+from ..training import validate_examples, validate_get_examples
+from ..training import validate_distillation_examples
+from ..errors import Errors, Warnings
 from .. import util
 from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
@@ -212,6 +219,121 @@ cdef class Parser(TrainablePipe):
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError
 
+    def distill(self,
+               teacher_pipe: Optional[TrainablePipe],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None):
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is trained on the transition probabilities
+        of the teacher.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+        
+        DOCS: https://spacy.io/api/dependencyparser#distill
+        """
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+
+        validate_distillation_examples(examples, "TransitionParser.distill")
+
+        set_dropout_rate(self.model, drop)
+
+        student_docs = [eg.predicted for eg in examples]
+
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
+        max_moves = self.cfg["update_with_oracle_cut_size"]
+        if max_moves >= 1:
+            # Chop sequences into lengths of this many words, to make the
+            # batch uniform length. Since we do not have a gold standard
+            # sequence, we use the teacher's predictions as the gold
+            # standard.
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+        else:
+            states = self.moves.init_batch(student_docs)
+
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(student_docs)
+
+        if sgd is not None:
+            self.finish_update(sgd)
+
+        losses[self.name] += loss
+
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
+        return losses
+
+
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
         """Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.
@@ -645,6 +767,40 @@ cdef class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
+    def _init_batch(self, teacher_step_model, docs, max_length):
+        """Make a square batch of length equal to the shortest transition
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]. In contrast to
+        _init_gold_batch, this version uses a teacher model to generate the
+        cut sequences."""
+        cdef:
+            StateClass start_state
+            StateClass state
+            Transition action
+        all_states = self.moves.init_batch(docs)
+        states = []
+        to_cut = []
+        for state, doc in zip(all_states, docs):
+            if not state.is_final():
+                if len(doc) < max_length:
+                    states.append(state)
+                else:
+                    to_cut.append(state)
+        while to_cut:
+            states.extend(state.copy() for state in to_cut)
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
+        return states
+
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 1509c31bbba..54ee053981f 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
     assert ents[1].kb_id == 0
 
 
+def test_is_distillable():
+    nlp = English()
+    ner = nlp.add_pipe("ner")
+    assert ner.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_ner = teacher.add_pipe("ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            teacher_ner.add_label(ent[2])
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.00001
+
+    student = English()
+    student_ner = student.add_pipe("ner")
+    student_ner.initialize(
+        get_examples=lambda: train_examples, labels=teacher_ner.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(100):
+        losses = {}
+        student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.0001
+
+    # test the trained model
+    test_text = "I like London."
+    doc = student(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+
+
 def test_beam_ner_scores():
     # Test that we can get confidence values out of the beam_ner pipe
     beam_width = 16
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 3565c62af0f..a943c3538e0 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -396,6 +396,55 @@ def test_overfitting_IO(pipe_name):
     assert_equal(batch_deps_1, no_batch_deps)
 
 
+def test_is_distillable():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+    assert parser.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_parser = teacher.add_pipe("parser")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            teacher_parser.add_label(dep)
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(200):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["parser"] < 0.0001
+
+    student = English()
+    student_parser = student.add_pipe("parser")
+    student_parser.initialize(
+        get_examples=lambda: train_examples, labels=teacher_parser.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(200):
+        losses = {}
+        student_parser.distill(
+            teacher_parser, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["parser"] < 0.0001
+
+    test_text = "I like securities."
+    doc = student(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
+
+
 # fmt: off
 @pytest.mark.slow
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index ba2ed4e5ff3..0f204ead477 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -214,6 +214,53 @@ def test_overfitting_IO(top_k):
     assert doc4[3].lemma_ == "egg"
 
 
+def test_is_distillable():
+    nlp = English()
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    assert lemmatizer.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
+    teacher_lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    student = English()
+    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+    student_lemmatizer.min_tree_freq = 1
+    student_lemmatizer.initialize(
+        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_lemmatizer.distill(
+            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    test_text = "She likes blue eggs"
+    doc = student(test_text)
+    assert doc[0].lemma_ == "she"
+    assert doc[1].lemma_ == "like"
+    assert doc[2].lemma_ == "blue"
+    assert doc[3].lemma_ == "egg"
+
+
 def test_lemmatizer_requires_labels():
     nlp = English()
     nlp.add_pipe("trainable_lemmatizer")
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index c2b65977ac3..fffb7b4ed7f 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -77,6 +77,12 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)
 
 
+def test_is_distillable():
+    nlp = English()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.is_distillable
+
+
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 2e40d86ff48..94285178310 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -11,6 +11,12 @@
 from spacy.training import Example
 
 
+def test_is_distillable():
+    nlp = English()
+    senter = nlp.add_pipe("senter")
+    assert senter.is_distillable
+
+
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5deb323dd71..5da5c209975 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -237,6 +237,52 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_is_distillable():
+    nlp = English()
+    tagger = nlp.add_pipe("tagger")
+    assert tagger.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["tagger"] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_tagger.distill(
+            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 710dac0571d..214c1bfbed1 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -602,6 +602,12 @@ def test_initialize_examples(name, get_examples, train_data):
         nlp.initialize(get_examples=get_examples())
 
 
+def test_is_distillable():
+    nlp = English()
+    textcat = nlp.add_pipe("textcat")
+    assert not textcat.is_distillable
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index a492a8be358..68f7e8a0d57 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,16 +8,10 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import (
-    Alignment,
-    Corpus,
-    Example,
-    biluo_tags_to_offsets,
-    biluo_tags_to_spans,
-    docs_to_json,
-    iob_to_biluo,
-    offsets_to_biluo_tags,
-)
+from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
+from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
+from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
@@ -377,6 +371,19 @@ def test_example_from_dict_some_ner(en_vocab):
     assert ner_tags == ["U-LOC", None, None, None]
 
 
+def test_validate_distillation_examples(en_vocab):
+    words = ["a", "b", "c", "d"]
+    spaces = [True, True, False, True]
+    predicted = Doc(en_vocab, words=words, spaces=spaces)
+
+    example = Example.from_dict(predicted, {})
+    validate_distillation_examples([example], "test_validate_distillation_examples")
+
+    example = Example.from_dict(predicted, {"words": words + ["e"]})
+    with pytest.raises(ValueError, match=r"distillation"):
+        validate_distillation_examples([example], "test_validate_distillation_examples")
+
+
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_json_to_docs_no_ner(en_vocab):
     data = [
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 5c2ba99320d..358b2bd806d 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,3 +1,6 @@
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 2c1ff34cf2f..c6da5157748 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,13 @@ def validate_examples(examples, method):
         raise TypeError(err)
 
 
+def validate_distillation_examples(examples, method):
+    validate_examples(examples, method)
+    for eg in examples:
+        if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
+            raise ValueError(Errors.E4003)
+
+
 def validate_get_examples(get_examples, method):
     """Check that a generator of a batch of examples received during processing is valid:
     the callable produces a non-empty list of Example objects.
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 771a00aeee1..5179ce48b84 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -131,6 +131,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## DependencyParser.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("parser")
+> student_pipe = student.add_pipe("parser")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -268,6 +301,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## DependencyParser.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_parser = teacher.get_pipe("parser")
+> student_parser = student.add_pipe("parser")
+> student_scores = student_parser.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_parser.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_parser.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## DependencyParser.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 17af19e8c38..2e099365758 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -115,6 +115,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## EditTreeLemmatizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("trainable_lemmatizer")
+> student_pipe = student.add_pipe("trainable_lemmatizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -269,6 +302,27 @@ Create an optimizer for the pipeline component.
 | ----------- | ---------------------------- |
 | **RETURNS** | The optimizer. ~~Optimizer~~ |
 
+## EditTreeLemmatizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_lemmatizer = teacher.get_pipe("trainable_lemmatizer")
+> student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+> student_scores = student_lemmatizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_lemmatizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_lemmatizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## EditTreeLemmatizer.use_params {id="use_params",tag="method, contextmanager"}
 
 Modify the pipe's model, to use the given parameter values. At the end of the
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 1f386bbb6ff..005d5d11deb 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -127,6 +127,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## EntityRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("ner")
+> student_pipe = student.add_pipe("ner")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -264,6 +297,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## EntityRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_ner = teacher.get_pipe("ner")
+> student_ner = student.add_pipe("ner")
+> student_scores = student_ner.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_ner.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_ner.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## EntityRecognizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 1fda807cb32..4f79458d319 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -121,6 +121,39 @@ delegate to the [`predict`](/api/morphologizer#predict) and
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Morphologizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("morphologizer")
+> student_pipe = student.add_pipe("morphologizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -259,6 +292,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_morphologizer = teacher.get_pipe("morphologizer")
+> student_morphologizer = student.add_pipe("morphologizer")
+> student_scores = student_morphologizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index b387ea58654..120c8f6908f 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -234,6 +234,39 @@ predictions and gold-standard annotations, and update the component's model.
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 
+## TrainablePipe.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("your_custom_pipe")
+> student_pipe = student.add_pipe("your_custom_pipe")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
@@ -281,6 +314,34 @@ This method needs to be overwritten with your own custom `get_loss` method.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## TrainablePipe.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+<Infobox variant="danger">
+
+This method needs to be overwritten with your own custom
+`get_teacher_student_loss` method.
+
+</Infobox>
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.get_pipe("your_custom_pipe")
+> student_pipe = student.add_pipe("your_custom_pipe")
+> student_scores = student_pipe.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_pipe.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_pipe.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## TrainablePipe.score {id="score",tag="method",version="3"}
 
 Score a batch of examples.
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index d5d096d7659..02fd57102e2 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -106,6 +106,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## SentenceRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("senter")
+> student_pipe = student.add_pipe("senter")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -254,6 +287,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## SentenceRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_senter = teacher.get_pipe("senter")
+> student_senter = student.add_pipe("senter")
+> student_scores = student_senter.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_senter.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_senter.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## SentenceRecognizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index ae14df212ee..664fd7940c1 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -105,6 +105,39 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tagger.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tagger")
+> student_pipe = student.add_pipe("tagger")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Tagger.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -265,6 +298,27 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
+## Tagger.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_tagger = teacher.get_pipe("tagger")
+> student_tagger = student.add_pipe("tagger")
+> student_scores = student_tagger.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_tagger.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_tagger.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
 ## Tagger.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9cdc0c8ab02..77216924405 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -1048,7 +1048,8 @@ backprop passes.
 Recursively wrap both the models and methods of each pipe using
 [NVTX](https://nvidia.github.io/NVTX/) range markers. By default, the following
 methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
-`get_loss`, `initialize`, `begin_update`, `finish_update`, `update`.
+`get_loss`, `get_teacher_student_loss`, `initialize`, `begin_update`,
+`finish_update`, `update`.
 
 | Name                        | Description                                                                                                                                                     |
 | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index fb5de5da102..9dbdadd0ebc 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1355,12 +1355,14 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.
 
-| Name                                 | Description                                                                                                                                                                                                                                                                                                                                   |
-| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
-| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
-| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
-| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
+| Name                                                             | Description                                                                                                                                                                                                                                                                                                                                   |
+| ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`update`](/api/pipe#update)                                     | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
+| [`distill`](/api/pipe#distill)                                   | Learn from a teacher pipeline using a batch of [`Doc`](/api/doc) objects and update the component's model.                                                                                                                                                                                                                                    |
+| [`initialize`](/api/pipe#initialize)                             | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
+| [`get_loss`](/api/pipe#get_loss)                                 | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
+| [`get_teacher_student_loss`](/api/pipe#get_teacher_student_loss) | Return a tuple of the loss and the gradient for the student scores relative to the teacher scores.                                                                                                                                                                                                                                            |
+| [`score`](/api/pipe#score)                                       | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
 

From 6cc9b0160f0103abac334f6ab952bec3a30c56df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 11:27:45 +0100
Subject: [PATCH 259/504] Merge the parser refactor into `v4` (#10940)

* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89edb306608826dca50619ea8a60de2b14.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
  this required computing an intermittent flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut` size into
separate sequences that have at most `update_with_oracle_cut`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a git that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/errors.py                               |   3 +
 spacy/ml/_precomputable_affine.py             | 164 -----
 spacy/ml/models/parser.py                     | 174 ++---
 spacy/ml/parser_model.pxd                     |  55 --
 spacy/ml/parser_model.pyx                     | 539 ---------------
 spacy/ml/tb_framework.pxd                     |  28 +
 spacy/ml/tb_framework.py                      |  51 --
 spacy/ml/tb_framework.pyx                     | 621 ++++++++++++++++++
 .../_parser_internals/_beam_utils.pyx         |   3 +-
 .../_parser_internals/_parser_utils.pxd       |   2 +
 .../_parser_internals/_parser_utils.pyx       |  22 +
 spacy/pipeline/_parser_internals/_state.pxd   |  59 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +
 spacy/pipeline/_parser_internals/batch.pxd    |   2 +
 spacy/pipeline/_parser_internals/batch.pyx    |  52 ++
 spacy/pipeline/_parser_internals/ner.pyx      |   3 +
 .../pipeline/_parser_internals/stateclass.pyx |   7 +
 .../_parser_internals/transition_system.pxd   |   7 +
 .../_parser_internals/transition_system.pyx   |  71 ++
 .../{dep_parser.pyx => dep_parser.py}         |  17 +-
 spacy/pipeline/{ner.pyx => ner.py}            |  33 +-
 spacy/pipeline/transition_parser.pxd          |  31 -
 spacy/pipeline/transition_parser.pyx          | 509 ++++++--------
 spacy/tests/parser/test_ner.py                |  11 +-
 spacy/tests/parser/test_parse.py              |  80 ++-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  53 +-
 spacy/training/example.pyx                    |   1 -
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 35 files changed, 1293 insertions(+), 1426 deletions(-)
 delete mode 100644 spacy/ml/_precomputable_affine.py
 delete mode 100644 spacy/ml/parser_model.pxd
 delete mode 100644 spacy/ml/parser_model.pyx
 create mode 100644 spacy/ml/tb_framework.pxd
 delete mode 100644 spacy/ml/tb_framework.py
 create mode 100644 spacy/ml/tb_framework.pyx
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 create mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 create mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (97%)
 rename spacy/pipeline/{ner.pyx => ner.py} (93%)
 delete mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index 0eb529c2098..a4a87d68aec 100755
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,10 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -45,6 +43,7 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -52,6 +51,7 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2817147f3e9..36325711d4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,12 +90,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -111,12 +110,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -387,12 +385,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -405,12 +402,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/errors.py b/spacy/errors.py
index 9bdb66006e5..9074a3fead8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -215,6 +215,8 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -978,6 +980,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
+    E4004 = ("Backprop is not supported when is_train is not set.")
 
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
deleted file mode 100644
index 1c20c622b2c..00000000000
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..59483839206 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,18 +1,22 @@
-from typing import List, Optional, cast
-
-from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
+from typing import Optional, List, Tuple, Any
 from thinc.types import Floats2d
+from thinc.api import Model
+import warnings
 
+from ...errors import Errors, Warnings
 from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
-from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
+from ...tokens.doc import Doc
 
+TransitionSystem = Any  # TODO
+State = Any  # TODO
 
-@registry.architectures("spacy.TransitionBasedParser.v2")
-def build_tb_parser_model(
+
+@registry.architectures.register("spacy.TransitionBasedParser.v2")
+def transition_parser_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
@@ -20,6 +24,46 @@ def build_tb_parser_model(
     maxout_pieces: int,
     use_upper: bool,
     nO: Optional[int] = None,
+) -> Model:
+    if not use_upper:
+        warnings.warn(Warnings.W400)
+
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v3")
+def transition_parser_v3(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+) -> Model:
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+def build_tb_parser_model(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
 ) -> Model:
     """
     Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -52,14 +96,7 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
-        is replaced with a ReLu non-linearity if use_upper=True, and no
-        non-linearity if use_upper=False.
-    use_upper (bool): Whether to use an additional hidden layer after the state
-        vector in order to predict the action scores. It is recommended to set
-        this to False for large pretrained models such as transformers, and True
-        for smaller networks. The upper layer is computed on CPU, which becomes
-        a bottleneck on larger GPU-based models, where it's also less necessary.
+        Recommended values are 1, 2 or 3.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -70,106 +107,11 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        list2array(),
-        Linear(hidden_width, t2v_width),
+    return TransitionModel(
+        tok2vec=tok2vec,
+        state_tokens=nr_feature_tokens,
+        hidden_width=hidden_width,
+        maxout_pieces=maxout_pieces,
+        nO=nO,
+        unseen_classes=set(),
     )
-    tok2vec.set_dim("nO", hidden_width)
-    lower = _define_lower(
-        nO=hidden_width if use_upper else nO,
-        nF=nr_feature_tokens,
-        nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
-    )
-    upper = None
-    if use_upper:
-        with use_ops("cpu"):
-            # Initialize weights at zero, as it's a classification layer.
-            upper = _define_upper(nO=nO, nI=None)
-    return TransitionModel(tok2vec, lower, upper, resize_output)
-
-
-def _define_upper(nO, nI):
-    return Linear(nO=nO, nI=nI, init_W=zero_init)
-
-
-def _define_lower(nO, nF, nI, nP):
-    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
-
-
-def resize_output(model, new_nO):
-    if model.attrs["has_upper"]:
-        return _resize_upper(model, new_nO)
-    return _resize_lower(model, new_nO)
-
-
-def _resize_upper(model, new_nO):
-    upper = model.get_ref("upper")
-    if upper.has_dim("nO") is None:
-        upper.set_dim("nO", new_nO)
-        return model
-    elif new_nO == upper.get_dim("nO"):
-        return model
-
-    smaller = upper
-    nI = smaller.maybe_get_dim("nI")
-    with use_ops("cpu"):
-        larger = _define_upper(nO=new_nO, nI=nI)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc2f(new_nO, nI)
-        larger_b = larger.ops.alloc1f(new_nO)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        # Weights are stored in (nr_out, nr_in) format, so we're basically
-        # just adding rows here.
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:old_nO] = smaller_W
-            larger_b[:old_nO] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-    model._layers[-1] = larger
-    model.set_ref("upper", larger)
-    return model
-
-
-def _resize_lower(model, new_nO):
-    lower = model.get_ref("lower")
-    if lower.has_dim("nO") is None:
-        lower.set_dim("nO", new_nO)
-        return model
-
-    smaller = lower
-    nI = smaller.maybe_get_dim("nI")
-    nF = smaller.maybe_get_dim("nF")
-    nP = smaller.maybe_get_dim("nP")
-    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
-        larger_b = larger.ops.alloc2f(new_nO, nP)
-        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        smaller_pad = smaller.get_param("pad")
-        # Copy the old weights and padding into the new layer
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:, 0:old_nO, :, :] = smaller_W
-            larger_pad[:, :, 0:old_nO, :] = smaller_pad
-            larger_b[0:old_nO, :] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-        larger.set_param("pad", larger_pad)
-    model._layers[1] = larger
-    model.set_ref("lower", larger)
-    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
deleted file mode 100644
index 4d2d7b3feeb..00000000000
--- a/spacy/ml/parser_model.pxd
+++ /dev/null
@@ -1,55 +0,0 @@
-from libc.string cimport memcpy, memset
-from thinc.backends.cblas cimport CBlas
-
-from ..pipeline._parser_internals._state cimport StateC
-from ..typedefs cimport hash_t, weight_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const float* seen_classes
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* scores
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
-
-
-cdef WeightsC get_c_weights(model) except *
-
-cdef SizesC get_c_sizes(model, int batch_size) except *
-
-cdef ActivationsC alloc_activations(SizesC n) nogil
-
-cdef void free_activations(const ActivationsC* A) nogil
-
-cdef void predict_states(
-    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) nogil
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
-
-cdef void cpu_log_loss(
-    float* d_scores,
-    const float* costs,
-    const int* is_valid,
-    const float* scores,
-    int O
-) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
deleted file mode 100644
index 10a9f0bc485..00000000000
--- a/spacy/ml/parser_model.pyx
+++ /dev/null
@@ -1,539 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-# cython: profile=False
-cimport numpy as np
-from libc.math cimport exp
-from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.cblas cimport saxpy, sgemm
-from thinc.backends.linalg cimport Vec, VecVec
-
-import numpy
-import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps
-
-from .. import util
-from ..errors import Errors
-
-from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport weight_t
-
-
-cdef WeightsC get_c_weights(model) except *:
-    cdef WeightsC output
-    cdef precompute_hiddens state2vec = model.state2vec
-    output.feat_weights = state2vec.get_feat_weights()
-    output.feat_bias = <const float*>state2vec.bias.data
-    cdef np.ndarray vec2scores_W
-    cdef np.ndarray vec2scores_b
-    if model.vec2scores is None:
-        output.hidden_weights = NULL
-        output.hidden_bias = NULL
-    else:
-        vec2scores_W = model.vec2scores.get_param("W")
-        vec2scores_b = model.vec2scores.get_param("b")
-        output.hidden_weights = <const float*>vec2scores_W.data
-        output.hidden_bias = <const float*>vec2scores_b.data
-    cdef np.ndarray class_mask = model._class_mask
-    output.seen_classes = <const float*>class_mask.data
-    return output
-
-
-cdef SizesC get_c_sizes(model, int batch_size) except *:
-    cdef SizesC output
-    output.states = batch_size
-    if model.vec2scores is None:
-        output.classes = model.state2vec.get_dim("nO")
-    else:
-        output.classes = model.vec2scores.get_dim("nO")
-    output.hiddens = model.state2vec.get_dim("nO")
-    output.pieces = model.state2vec.get_dim("nP")
-    output.feats = model.state2vec.get_dim("nF")
-    output.embed_width = model.tokvecs.shape[1]
-    return output
-
-
-cdef ActivationsC alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    resize_activations(&A, n)
-    return A
-
-
-cdef void free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.scores)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(
-            A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
-        )
-        A.scores = <float*>realloc(
-            A.scores, n.states * n.classes * sizeof(A.scores[0])
-        )
-        A.unmaxed = <float*>realloc(
-            A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
-        )
-        A.hiddens = <float*>realloc(
-            A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
-        )
-        A.is_valid = <int*>realloc(
-            A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
-        )
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void predict_states(
-    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) nogil:
-    resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(
-        cblas,
-        A.unmaxed,
-        W.feat_weights,
-        A.token_ids,
-        n.states,
-        n.feats,
-        n.hiddens * n.pieces
-    )
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = _arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    memset(A.scores, 0, n.states * n.classes * sizeof(float))
-    if W.hidden_weights == NULL:
-        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(
-            False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes
-        )
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = A.scores[0]
-    for i in range(1, n.states * n.classes):
-        if A.scores[i] < min_:
-            min_ = A.scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if not W.seen_classes[j]:
-                A.scores[i*n.classes+j] = min_
-
-
-cdef void sum_state_features(
-    CBlas cblas,
-    float* output,
-    const float* cached,
-    const int* token_ids,
-    int B,
-    int F,
-    int O
-) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    padding = cached
-    cached += F * O
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
-
-
-cdef void cpu_log_loss(
-    float* d_scores,
-    const float* costs,
-    const int* is_valid,
-    const float* scores,
-    int O
-) nogil:
-    """Do multi-label log loss"""
-    cdef double max_, gmax, Z, gZ
-    best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = _arg_max(scores, O)
-
-    if best == -1 or guess == -1:
-        # These shouldn't happen, but if they do, we want to make sure we don't
-        # cause an OOB access.
-        return
-    Z = 1e-10
-    gZ = 1e-10
-    max_ = scores[guess]
-    gmax = scores[best]
-    for i in range(O):
-        Z += exp(scores[i] - max_)
-        if costs[i] <= costs[best]:
-            gZ += exp(scores[i] - gmax)
-    for i in range(O):
-        if costs[i] <= costs[best]:
-            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
-        else:
-            d_scores[i] = exp(scores[i]-max_) / Z
-
-
-cdef int arg_max_if_gold(
-    const weight_t* scores, const weight_t* costs, const int* is_valid, int n
-) nogil:
-    # Find minimum cost
-    cdef float cost = 1
-    for i in range(n):
-        if is_valid[i] and costs[i] < cost:
-            cost = costs[i]
-    # Now find best-scoring with that cost
-    cdef int best = -1
-    for i in range(n):
-        if costs[i] <= cost and is_valid[i]:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-class ParserStepModel(Model):
-    def __init__(
-        self,
-        docs,
-        layers,
-        *,
-        has_upper,
-        unseen_classes=None,
-        train=True,
-        dropout=0.1
-    ):
-        Model.__init__(self, name="parser_step_model", forward=step_forward)
-        self.attrs["has_upper"] = has_upper
-        self.attrs["dropout_rate"] = dropout
-        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
-        if layers[1].get_dim("nP") >= 2:
-            activation = "maxout"
-        elif has_upper:
-            activation = None
-        else:
-            activation = "relu"
-        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
-                                            activation=activation, train=train)
-        if has_upper:
-            self.vec2scores = layers[-1]
-        else:
-            self.vec2scores = None
-        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
-        self.backprops = []
-        self._class_mask = numpy.zeros((self.nO,), dtype='f')
-        self._class_mask.fill(1)
-        if unseen_classes is not None:
-            for class_ in unseen_classes:
-                self._class_mask[class_] = 0.
-
-    def clear_memory(self):
-        del self.tokvecs
-        del self.bp_tokvecs
-        del self.state2vec
-        del self.backprops
-        del self._class_mask
-
-    @property
-    def nO(self):
-        if self.attrs["has_upper"]:
-            return self.vec2scores.get_dim("nO")
-        else:
-            return self.state2vec.get_dim("nO")
-
-    def class_is_unseen(self, class_):
-        return self._class_mask[class_]
-
-    def mark_class_unseen(self, class_):
-        self._class_mask[class_] = 0
-
-    def mark_class_seen(self, class_):
-        self._class_mask[class_] = 1
-
-    def get_token_ids(self, states):
-        cdef StateClass state
-        states = [state for state in states if not state.is_final()]
-        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
-                                          dtype='i', order='C')
-        ids.fill(-1)
-        c_ids = <int*>ids.data
-        for state in states:
-            state.c.set_context_tokens(c_ids, ids.shape[1])
-            c_ids += ids.shape[1]
-        return ids
-
-    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-        if (
-            isinstance(self.state2vec.ops, CupyOps)
-            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
-        ):
-            # Move token_ids and d_vector to GPU, asynchronously
-            self.backprops.append((
-                util.get_async(self.cuda_stream, token_ids),
-                util.get_async(self.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
-    def finish_steps(self, golds):
-        # Add a padding vector to the d_tokvecs gradient, so that missing
-        # values don't affect the real gradient.
-        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
-        # Tells CUDA to block, so our async copies complete.
-        if self.cuda_stream is not None:
-            self.cuda_stream.synchronize()
-        for ids, d_vector, bp_vector in self.backprops:
-            d_state_features = bp_vector((d_vector, ids))
-            ids = ids.flatten()
-            d_state_features = d_state_features.reshape(
-                (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
-        # Padded -- see update()
-        self.bp_tokvecs(d_tokvecs[:-1])
-        return d_tokvecs
-
-
-NUMPY_OPS = NumpyOps()
-
-
-def step_forward(model: ParserStepModel, states, is_train):
-    token_ids = model.get_token_ids(states)
-    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
-    mask = None
-    if model.attrs["has_upper"]:
-        dropout_rate = model.attrs["dropout_rate"]
-        if is_train and dropout_rate > 0:
-            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
-            vector *= mask
-        scores, get_d_vector = model.vec2scores(vector, is_train)
-    else:
-        scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores  # no-cython-lint: E731
-    # If the class is unseen, make sure its score is minimum
-    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
-
-    def backprop_parser_step(d_scores):
-        # Zero vectors for unseen classes
-        d_scores *= model._class_mask
-        d_vector = get_d_vector(d_scores)
-        if mask is not None:
-            d_vector *= mask
-        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
-        return None
-    return scores, backprop_parser_step
-
-
-cdef class precompute_hiddens:
-    """Allow a model to be "primed" by pre-computing input features in bulk.
-
-    This is used for the parser, where we want to take a batch of documents,
-    and compute vectors for each (token, position) pair. These vectors can then
-    be reused, especially for beam-search.
-
-    Let's say we're using 12 features for each state, e.g. word at start of
-    buffer, three words on stack, their children, etc. In the normal arc-eager
-    system, a document of length N is processed in 2*N states. This means we'll
-    create 2*N*12 feature vectors --- but if we pre-compute, we only need
-    N*12 vector computations. The saving for beam-search is much better:
-    if we have a beam of k, we'll normally make 2*N*12*K computations --
-    so we can save the factor k. This also gives a nice CPU/GPU division:
-    we can do all our hard maths up front, packed into large multiplications,
-    and do the hard-to-program parsing on the CPU.
-    """
-    cdef readonly int nF, nO, nP
-    cdef bint _is_synchronized
-    cdef public object ops
-    cdef public object numpy_ops
-    cdef public object _cpu_ops
-    cdef np.ndarray _features
-    cdef np.ndarray _cached
-    cdef np.ndarray bias
-    cdef object _cuda_stream
-    cdef object _bp_hiddens
-    cdef object activation
-
-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
-                 activation="maxout", train=False):
-        gpu_cached, bp_features = lower_model(tokvecs, train)
-        cdef np.ndarray cached
-        if not isinstance(gpu_cached, numpy.ndarray):
-            # Note the passing of cuda_stream here: it lets
-            # cupy make the copy asynchronously.
-            # We then have to block before first use.
-            cached = gpu_cached.get(stream=cuda_stream)
-        else:
-            cached = gpu_cached
-        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
-            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
-        else:
-            self.bias = lower_model.get_param("b")
-        self.nF = cached.shape[1]
-        if lower_model.has_dim("nP"):
-            self.nP = lower_model.get_dim("nP")
-        else:
-            self.nP = 1
-        self.nO = cached.shape[2]
-        self.ops = lower_model.ops
-        self.numpy_ops = NumpyOps()
-        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
-        assert activation in (None, "relu", "maxout")
-        self.activation = activation
-        self._is_synchronized = False
-        self._cuda_stream = cuda_stream
-        self._cached = cached
-        self._bp_hiddens = bp_features
-
-    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized and self._cuda_stream is not None:
-            self._cuda_stream.synchronize()
-            self._is_synchronized = True
-        return <float*>self._cached.data
-
-    def has_dim(self, name):
-        if name == "nF":
-            return self.nF if self.nF is not None else True
-        elif name == "nP":
-            return self.nP if self.nP is not None else True
-        elif name == "nO":
-            return self.nO if self.nO is not None else True
-        else:
-            return False
-
-    def get_dim(self, name):
-        if name == "nF":
-            return self.nF
-        elif name == "nP":
-            return self.nP
-        elif name == "nO":
-            return self.nO
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def set_dim(self, name, value):
-        if name == "nF":
-            self.nF = value
-        elif name == "nP":
-            self.nP = value
-        elif name == "nO":
-            self.nO = value
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def __call__(self, X, bint is_train):
-        if is_train:
-            return self.begin_update(X)
-        else:
-            return self.predict(X), lambda X: X
-
-    def predict(self, X):
-        return self.begin_update(X)[0]
-
-    def begin_update(self, token_ids):
-        cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO, self.nP), dtype='f')
-        # This is tricky, but (assuming GPU available);
-        # - Input to forward on CPU
-        # - Output from forward on CPU
-        # - Input to backward on GPU!
-        # - Output from backward on GPU
-        bp_hiddens = self._bp_hiddens
-
-        cdef CBlas cblas = self._cpu_ops.cblas()
-
-        feat_weights = self.get_feat_weights()
-        cdef int[:, ::1] ids = token_ids
-        sum_state_features(
-            cblas, <float*>state_vector.data,
-            feat_weights, &ids[0, 0],
-            token_ids.shape[0], self.nF, self.nO*self.nP
-        )
-        state_vector += self.bias
-        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
-
-        def backward(d_state_vector_ids):
-            d_state_vector, token_ids = d_state_vector_ids
-            d_state_vector = bp_nonlinearity(d_state_vector)
-            d_tokens = bp_hiddens((d_state_vector, token_ids))
-            return d_tokens
-        return state_vector, backward
-
-    def _nonlinearity(self, state_vector):
-        if self.activation == "maxout":
-            return self._maxout_nonlinearity(state_vector)
-        else:
-            return self._relu_nonlinearity(state_vector)
-
-    def _maxout_nonlinearity(self, state_vector):
-        state_vector, mask = self.numpy_ops.maxout(state_vector)
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_maxout(d_best):
-            return self.ops.backprop_maxout(d_best, mask, self.nP)
-
-        return state_vector, backprop_maxout
-
-    def _relu_nonlinearity(self, state_vector):
-        state_vector = state_vector.reshape((state_vector.shape[0], -1))
-        mask = state_vector >= 0.
-        state_vector *= mask
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_relu(d_best):
-            d_best *= mask
-            return d_best.reshape((d_best.shape + (1,)))
-
-        return state_vector, backprop_relu
-
-cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
new file mode 100644
index 00000000000..965508519e8
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
deleted file mode 100644
index e351ad4e570..00000000000
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from thinc.api import Model, noop
-
-from ..util import registry
-from .parser_model import ParserStepModel
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
new file mode 100644
index 00000000000..79be13b00bd
--- /dev/null
+++ b/spacy/ml/tb_framework.pyx
@@ -0,0 +1,621 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+from typing import List, Tuple, Any, Optional, TypeVar, cast
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from libcpp.vector cimport vector
+import numpy
+cimport numpy as np
+from thinc.api import Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.api import NumpyOps
+from thinc.backends.cblas cimport CBlas, saxpy, sgemm
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
+from thinc.types import Ints1d, Ints2d
+
+from ..errors import Errors
+from ..pipeline._parser_internals import _beam_utils
+from ..pipeline._parser_internals.batch import GreedyBatch
+from ..pipeline._parser_internals._parser_utils cimport arg_max
+from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
+from ..pipeline._parser_internals.transition_system cimport TransitionSystem
+from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..tokens.doc import Doc
+from ..util import registry
+
+
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
+def TransitionModel(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    beam_width: int = 1,
+    beam_density: float = 0.0,
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
+
+    # FIXME: we use `output` as a container for the output layer's
+    # weights and biases. Thinc optimizers cannot handle resizing
+    # of parameters. So, when the parser model is resized, we
+    # construct a new `output` layer, which has a different key in
+    # the optimizer. Once the optimizer supports parameter resizing,
+    # we can replace the `output` layer by `output_W` and `output_b`
+    # parameters in this model.
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+
+    return Model(
+        name="parser_model",
+        forward=forward,
+        init=init,
+        layers=[tok2vec_projected, output],
+        refs={
+            "tok2vec": tok2vec_projected,
+            "output": output,
+        },
+        params={
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
+        attrs={
+            "beam_width": beam_width,
+            "beam_density": beam_density,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    output = model.get_ref("output")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        output.set_dim("nO", new_nO)
+        output.initialize()
+        return model
+    elif new_nO <= old_nO:
+        return model
+    elif output.has_param("W"):
+        nH = model.get_dim("nH")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
+        new_W[:old_nO] = old_W  # type: ignore
+        new_b[:old_nO] = old_b  # type: ignore
+        for i in range(old_nO, new_nO):
+            model.attrs["unseen_classes"].add(i)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
+    # TODO: Avoid this private intrusion
+    model._dims["nO"] = new_nO
+    return model
+
+
+def init(
+    model,
+    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
+    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
+):
+    if X is not None:
+        docs, moves = X
+        model.get_ref("tok2vec").initialize(X=docs)
+    else:
+        model.get_ref("tok2vec").initialize()
+    inferred_nO = _infer_nO(Y)
+    if inferred_nO is not None:
+        current_nO = model.maybe_get_dim("nO")
+        if current_nO is None or current_nO != inferred_nO:
+            model.attrs["resize_output"](model, inferred_nO)
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nH = model.get_dim("nH")
+    nI = model.get_dim("nI")
+    nF = model.get_dim("nF")
+    ops = model.ops
+
+    Wl = ops.alloc2f(nH * nP, nF * nI)
+    bl = ops.alloc1f(nH * nP)
+    padl = ops.alloc1f(nI)
+    # Wl = zero_init(ops, Wl.shape)
+    Wl = glorot_uniform_init(ops, Wl.shape)
+    padl = uniform_init(ops, padl.shape)  # type: ignore
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
+    # model = _lsuv_init(model)
+    return model
+
+
+class TransitionModelInputs:
+    """
+    Input to transition model.
+    """
+
+    # dataclass annotation is not yet supported in Cython 0.29.x,
+    # so, we'll do something close to it.
+
+    actions: Optional[List[Ints1d]]
+    docs: List[Doc]
+    max_moves: int
+    moves: TransitionSystem
+    states: Optional[List[State]]
+
+    __slots__ = [
+        "actions",
+        "docs",
+        "max_moves",
+        "moves",
+        "states",
+    ]
+
+    def __init__(
+        self,
+        docs: List[Doc],
+        moves: TransitionSystem,
+        actions: Optional[List[Ints1d]]=None,
+        max_moves: int=0,
+        states: Optional[List[State]]=None):
+        """
+        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
+        docs (List[Doc]): Docs to predict transition sequences for.
+        max_moves: (int): the maximum number of moves to apply, values less
+            than 1 will apply moves to states until they are final states.
+        moves (TransitionSystem): the transition system to use when predicting
+            the transition sequences.
+        states (Optional[List[States]]): the initial states to predict the
+            transition sequences for. When absent, the initial states are
+            initialized from the provided Docs.
+        """
+        self.actions = actions
+        self.docs = docs
+        self.moves = moves
+        self.max_moves = max_moves
+        self.states = states
+
+
+def forward(model, inputs: TransitionModelInputs, is_train: bool):
+    docs = inputs.docs
+    moves = inputs.moves
+    actions = inputs.actions
+
+    beam_width = model.attrs["beam_width"]
+    hidden_pad = model.get_param("hidden_pad")
+    tok2vec = model.get_ref("tok2vec")
+
+    states = moves.init_batch(docs) if inputs.states is None else inputs.states
+    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
+    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
+    seen_mask = _get_seen_mask(model)
+
+    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
+        # Note: max_moves is only used during training, so we don't need to
+        #       pass it to the greedy inference path.
+        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
+    else:
+        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
+            feats, backprop_feats, seen_mask, is_train, actions=actions,
+            max_moves=inputs.max_moves)
+
+
+def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
+                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+    cdef vector[StateC*] c_states
+    cdef StateClass state
+    for state in states:
+        if not state.is_final():
+            c_states.push_back(state.c)
+    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
+    # Precomputed features have rows for each token, plus one for padding.
+    cdef int n_tokens = feats.shape[0] - 1
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
+    cdef CBlas cblas = model.ops.cblas()
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+
+    def backprop(dY):
+        raise ValueError(Errors.E4004)
+
+    return (states, scores), backprop
+
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+    cdef int i, j
+    cdef vector[StateC *] unfinished
+    cdef ActivationsC activations = _alloc_activations(sizes)
+    cdef np.ndarray step_scores
+    cdef np.ndarray step_actions
+
+    scores = []
+    while sizes.states >= 1:
+        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
+        step_actions = actions[0] if actions is not None else None
+        with nogil:
+            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
+            if actions is None:
+                # Validate actions, argmax, take action.
+                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
+                    sizes.states)
+            else:
+                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+        sizes.states = unfinished.size()
+        scores.append(step_scores)
+        unfinished.clear()
+        actions = actions[1:] if actions is not None else None
+    _free_activations(&activations)
+
+    return scores
+
+
+def _forward_fallback(
+    model: Model,
+    moves: TransitionSystem,
+    states: List[StateClass],
+    tokvecs, backprop_tok2vec,
+    feats,
+    backprop_feats,
+    seen_mask,
+    is_train: bool,
+    actions: Optional[List[Ints1d]]=None,
+    max_moves: int=0):
+    nF = model.get_dim("nF")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+
+    beam_width = model.attrs["beam_width"]
+    beam_density = model.attrs["beam_density"]
+
+    ops = model.ops
+
+    all_ids = []
+    all_which = []
+    all_statevecs = []
+    all_scores = []
+    if beam_width == 1:
+        batch = GreedyBatch(moves, states, None)
+    else:
+        batch = _beam_utils.BeamBatch(
+            moves, states, None, width=beam_width, density=beam_density
+        )
+    arange = ops.xp.arange(nF)
+    n_moves = 0
+    while not batch.is_done:
+        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
+        for i, state in enumerate(batch.get_unfinished_states()):
+            state.set_context_tokens(ids, i, nF)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
+        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
+        preacts2f += hidden_b
+        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
+        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
+        statevecs, which = ops.maxout(preacts)
+        # We don't use output's backprop, since we want to backprop for
+        # all states at once, rather than a single state.
+        scores = output.predict(statevecs)
+        scores[:, seen_mask] = ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
+        cpu_scores = ops.to_numpy(scores)
+        if actions is None:
+            batch.advance(cpu_scores)
+        else:
+            batch.advance_with_actions(actions[0])
+            actions = actions[1:]
+        all_scores.append(scores)
+        if is_train:
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
+        if n_moves >= max_moves >= 1:
+            break
+        n_moves += 1
+
+    def backprop_parser(d_states_d_scores):
+        ids = ops.xp.vstack(all_ids)
+        which = ops.xp.vstack(all_which)
+        statevecs = ops.xp.vstack(all_statevecs)
+        _, d_scores = d_states_d_scores
+        if model.attrs.get("unseen_classes"):
+            # If we have a negative gradient (i.e. the probability should
+            # increase) on any classes we filtered out as unseen, mark
+            # them as seen.
+            for clas in set(model.attrs["unseen_classes"]):
+                if (d_scores[:, clas] < 0).any():
+                    model.attrs["unseen_classes"].remove(clas)
+        d_scores *= seen_mask == False
+        # Calculate the gradients for the parameters of the output layer.
+        # The weight gemm is (nS, nO) @ (nS, nH).T
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
+        # This gemm is (nS, nO) @ (nO, nH)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
+        # Backprop through the maxout activation
+        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
+        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        # We don't need to backprop the summation, because we pass back the IDs instead
+        d_state_features = backprop_feats((d_preacts2f, ids))
+        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
+        ops.scatter_add(d_tokvecs, ids, d_state_features)
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
+        return (backprop_tok2vec(d_tokvecs[:-1]), None)
+
+    return (list(batch), all_scores), backprop_parser
+
+
+def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
+    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = True
+    return mask
+
+
+def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
+    W: Floats2d = model.get_param("hidden_W")
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
+    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
+    W3f = W3f.transpose((1, 0, 2))
+    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
+    assert X.shape == (X.shape[0], nI), X.shape
+    Yf_ = model.ops.gemm(X, W2f, trans2=True)
+    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
+
+    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nH, nP), and get back:
+        # (nB, nH, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nH, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        dXf = model.ops.gemm(dY, W)
+        Xf = X[ids].reshape((ids.shape[0], -1))
+        dW = model.ops.gemm(dY, Xf, trans1=True)
+        model.inc_grad("hidden_W", dW)
+        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
+
+    return Yf, backward
+
+
+def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
+    if Y is None:
+        return None
+    _, scores = Y
+    if len(scores) == 0:
+        return None
+    assert scores[0].shape[0] >= 1
+    assert len(scores[0].shape) == 2
+    return scores[0].shape[1]
+
+
+def _lsuv_init(model: Model):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    W = model.maybe_get_param("hidden_W")
+    if W is not None and W.any():
+        return
+
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nH, nP, nI)
+    b = model.ops.alloc2f(nH, nP)
+    pad = model.ops.alloc4f(1, nF, nH, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc_f((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc_f((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("hidden_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("hidden_b", b)
+        else:
+            break
+    return model
+
+
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
+
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = <const float*>hidden_b.data
+    weights.hidden_weights = <const float *> output_W.data
+    weights.hidden_bias = <const float *> output_b.data
+    weights.seen_mask = <const int8_t*> seen_mask.data
+
+    return weights
+
+
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
+
+
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    _resize_activations(&A, n)
+    return A
+
+
+cdef void _free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    if W.hidden_weights == NULL:
+        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+                      1.0, <const float *>A.hiddens, n.hiddens,
+                      <const float *>W.hidden_weights, n.hiddens,
+                      0.0, scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = scores[0]
+    for i in range(1, n.states * n.classes):
+        if scores[i] < min_:
+            min_ = scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if W.seen_mask[j]:
+                scores[i*n.classes+j] = min_
+
+
+cdef void _sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    cdef int B = n.states
+    cdef int O = n.hiddens * n.pieces
+    cdef int F = n.feats
+    cdef int T = n.tokens
+    padding = cached + (T * F * O)
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index d004d313c3e..c86de231d09 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -6,6 +6,7 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
@@ -25,7 +26,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(object):
+cdef class BeamBatch(Batch):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
new file mode 100644
index 00000000000..7fee05bad60
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd
@@ -0,0 +1,2 @@
+cdef int arg_max(const float* scores, const int n_classes) nogil
+cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
new file mode 100644
index 00000000000..582756bf5be
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx
@@ -0,0 +1,22 @@
+# cython: infer_types=True
+
+cdef inline int arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index c063cf97cd4..1b6b25e5f16 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -8,6 +8,7 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -27,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -35,31 +36,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -132,19 +136,20 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
             return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
     int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
         else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
             if b_i >= this.length:
                 return -1
             else:
@@ -243,7 +248,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
             return 1
         else:
             return 0
@@ -327,7 +332,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -347,6 +352,9 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -355,12 +363,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = arcs.back()
+        arc = &arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -400,10 +408,11 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 10f2649baa0..673e36bf5ac 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -778,6 +778,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -862,6 +864,7 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
new file mode 100644
index 00000000000..60734e549aa
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pxd
@@ -0,0 +1,2 @@
+cdef class Batch:
+    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
new file mode 100644
index 00000000000..91073b52e68
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pyx
@@ -0,0 +1,52 @@
+from typing import Any
+
+TransitionSystem = Any  # TODO
+
+cdef class Batch:
+    def advance(self, scores):
+        raise NotImplementedError
+
+    def get_states(self):
+        raise NotImplementedError
+
+    @property
+    def is_done(self):
+        raise NotImplementedError
+
+    def get_unfinished_states(self):
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+
+class GreedyBatch(Batch):
+    def __init__(self, moves: TransitionSystem, states, golds):
+        self._moves = moves
+        self._states = states
+        self._next_states = [s for s in states if not s.is_final()]
+
+    def advance(self, scores):
+        self._next_states = self._moves.transition_states(self._next_states, scores)
+
+    def advance_with_actions(self, actions):
+        self._next_states = self._moves.apply_actions(self._next_states, actions)
+
+    def get_states(self):
+        return self._states
+
+    @property
+    def is_done(self):
+        return all(s.is_final() for s in self._states)
+
+    def get_unfinished_states(self):
+        return [st for st in self._states if not st.is_final()]
+
+    def __getitem__(self, i):
+        return self._states[i]
+
+    def __len__(self):
+        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 6851f9f2096..cf19c834ed9 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -311,6 +311,8 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -648,6 +650,7 @@ cdef class Unit:
         return cost
 
 
+
 cdef class Out:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index 24b9f1adc33..e49ff63c48b 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -20,6 +20,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -176,3 +180,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 04cd10d8864..66cc7747b69 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,3 +57,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index e035053b314..d1340d68c62 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,6 +3,8 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -11,6 +13,7 @@ import srsly
 from ...structs cimport TokenC
 from ...typedefs cimport attr_t, weight_t
 from .stateclass cimport StateClass
+from ._parser_utils cimport arg_max_if_valid
 
 from ... import util
 from ...errors import Errors
@@ -74,7 +77,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -86,6 +100,8 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -111,6 +127,7 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -138,6 +155,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +289,35 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil:
+        cdef int i
+        cdef Transition action
+        cdef StateC* state
+        for i in range(batch_size):
+            state = states[i]
+            action = moves.c[actions[i]]
+            action.do(state, action.label)
+            state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+    int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)
+
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py
similarity index 97%
rename from spacy/pipeline/dep_parser.pyx
rename to spacy/pipeline/dep_parser.py
index 18a220bd631..370a698c25a 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.py
@@ -5,6 +5,8 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from ._parser_internals.arc_eager cimport ArcEager
 from .transition_parser cimport Parser
@@ -19,12 +21,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -230,6 +231,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -237,8 +239,11 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -251,11 +256,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -275,8 +281,7 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py
similarity index 93%
rename from spacy/pipeline/ner.pyx
rename to spacy/pipeline/ner.py
index bb009dc7a6a..4c2a3ac093c 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.py
@@ -5,6 +5,13 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
+from ..util import registry
+from ..training import remove_bilu_prefix
 
 from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
@@ -16,12 +23,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -46,8 +52,12 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -114,7 +124,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -188,11 +203,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -210,15 +226,14 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,   # not relevant for NER
+            min_action_freq=1,  # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
deleted file mode 100644
index 7ef20563b12..00000000000
--- a/spacy/pipeline/transition_parser.pxd
+++ /dev/null
@@ -1,31 +0,0 @@
-from cymem.cymem cimport Pool
-from thinc.backends.cblas cimport CBlas
-
-from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
-from ..vocab cimport Vocab
-from ._parser_internals._state cimport StateC
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from .trainable_pipe cimport TrainablePipe
-
-
-cdef class Parser(TrainablePipe):
-    cdef public object _rehearsal_model
-    cdef readonly TransitionSystem moves
-    cdef public object _multitasks
-    cdef object _cpu_ops
-
-    cdef void _parseC(
-        self,
-        CBlas cblas,
-        StateC** states,
-        WeightsC weights,
-        SizesC sizes
-    ) nogil
-
-    cdef void c_transition_batch(
-        self,
-        StateC** states,
-        const float* scores,
-        int nr_class,
-        int batch_size
-    ) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index feab7e7404b..d71a4ab0355 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,30 +13,29 @@ from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
+import contextlib
 
 import srsly
 from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
+from thinc.api import chain, softmax_activation, use_ops, get_array_module
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 import numpy.random
 import numpy
 import numpy.random
 import srsly
 from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ._parser_internals.stateclass cimport StateClass
+from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
-from ._parser_internals.stateclass cimport StateClass
-
-from .trainable_pipe import TrainablePipe
-
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
+from ..vocab cimport Vocab
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ..typedefs cimport weight_t
 
 from ..training import validate_examples, validate_get_examples
 from ..training import validate_distillation_examples
@@ -49,7 +48,7 @@ from ._parser_internals import _beam_utils
 NUMPY_OPS = NumpyOps()
 
 
-cdef class Parser(TrainablePipe):
+class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -149,8 +148,9 @@ cdef class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
+        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -256,15 +256,6 @@ cdef class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
-        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
-        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
-
-        # Add softmax activation, so that we can compute student losses
-        # with cross-entropy loss.
-        with use_ops("numpy"):
-            teacher_model = chain(teacher_step_model, softmax_activation())
-            student_model = chain(student_step_model, softmax_activation())
-        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -272,51 +263,39 @@ cdef class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        loss = 0.0
-        n_moves = 0
-        while states:
-            # We do distillation as follows: (1) for every state, we compute the
-            # transition softmax distributions: (2) we backpropagate the error of
-            # the student (compared to the teacher) into the student model; (3)
-            # for all states, we move to the next state using the student's
-            # predictions.
-            teacher_scores = teacher_model.predict(states)
-            student_scores, backprop = student_model.begin_update(states)
-            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-            backprop(d_scores)
-            loss += state_loss
-            self.transition_states(states, student_scores)
-            states = [state for state in states if not state.is_final()]
-
-            # Stop when we reach the maximum number of moves, otherwise we start
-            # to process the remainder of cut sequences again.
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+        # We distill as follows: 1. we first let the student predict transition
+        # sequences (and the corresponding transition probabilities); (2) we
+        # let the teacher follow the student's predicted transition sequences
+        # to obtain the teacher's transition probabilities; (3) we compute the
+        # gradients of the student's transition distributions relative to the
+        # teacher's distributions.
+
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
+            moves=self.moves, actions=actions)
+        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
-        backprop_tok2vec(student_docs)
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        backprop_scores((student_states, d_scores))
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
-        del backprop
-        del backprop_tok2vec
-        teacher_step_model.clear_memory()
-        student_step_model.clear_memory()
-        del teacher_model
-        del student_model
-
         return losses
 
 
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+        normalize: bool=False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -328,10 +307,28 @@ cdef class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
+
+        # We can't easily hook up a softmax layer in the parsing model, since
+        # the get_loss does additional masking. So, we could apply softmax
+        # manually here and use Thinc's cross-entropy loss. But it's a bit
+        # suboptimal, since we can have a lot of states that would result in
+        # many kernel launches. Futhermore the parsing model's backprop expects
+        # a XP array, so we'd have to concat the softmaxes anyway. So, like
+        # the get_loss implementation, we'll compute the loss and gradients
+        # ourselves.
+
+        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
+            axis=-1, inplace=True)
+        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
+            axis=-1, inplace=True)
+
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = student_scores - teacher_scores
+        if normalize:
+            d_scores /= d_scores.shape[0]
+        loss = (d_scores**2).sum() / d_scores.size
+
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -354,9 +351,6 @@ cdef class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
-        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
-            deals with a failing batch of documents. The default function just reraises
-            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -377,78 +371,29 @@ cdef class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
+        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        if self.cfg["beam_width"] == 1:
-            return self.greedy_parse(docs, drop=0.0)
-        else:
-            return self.beam_parse(
-                docs,
-                drop=0.0,
-                beam_width=self.cfg["beam_width"],
-                beam_density=self.cfg["beam_density"]
-            )
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states_or_beams, _ = self.model.predict(inputs)
+        return states_or_beams
 
     def greedy_parse(self, docs, drop=0.):
-        cdef vector[StateC*] states
-        cdef StateClass state
-        cdef CBlas cblas = self._cpu_ops.cblas()
+        self._resize()
         self._ensure_labels_are_added(docs)
-        set_dropout_rate(self.model, drop)
-        batch = self.moves.init_batch(docs)
-        model = self.model.predict(docs)
-        weights = get_c_weights(model)
-        for state in batch:
-            if not state.is_final():
-                states.push_back(state.c)
-        sizes = get_c_sizes(model, states.size())
-        with nogil:
-            self._parseC(cblas, &states[0], weights, sizes)
-        model.clear_memory()
-        del model
-        return batch
+        with _change_attrs(self.model, beam_width=1):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states, _ = self.model.predict(inputs)
+        return states
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
         self._ensure_labels_are_added(docs)
-        batch = _beam_utils.BeamBatch(
-            self.moves,
-            self.moves.init_batch(docs),
-            None,
-            beam_width,
-            density=beam_density
-        )
-        model = self.model.predict(docs)
-        while not batch.is_done:
-            states = batch.get_unfinished_states()
-            if not states:
-                break
-            scores = model.predict(states)
-            batch.advance(scores)
-        model.clear_memory()
-        del model
-        return list(batch)
-
-    cdef void _parseC(
-        self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
-    ) nogil:
-        cdef int i
-        cdef vector[StateC*] unfinished
-        cdef ActivationsC activations = alloc_activations(sizes)
-        while sizes.states >= 1:
-            predict_states(cblas, &activations, states, &weights, sizes)
-            # Validate actions, argmax, take action.
-            self.c_transition_batch(
-                states, activations.scores, sizes.classes, sizes.states
-            )
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-            sizes.states = unfinished.size()
-            unfinished.clear()
-        free_activations(&activations)
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            beams, _ = self.model.predict(inputs)
+        return beams
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -459,40 +404,6 @@ cdef class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
-    def transition_states(self, states, float[:, ::1] scores):
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    cdef void c_transition_batch(
-        self,
-        StateC** states,
-        const float* scores,
-        int nr_class,
-        int batch_size
-    ) nogil:
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        with gil:
-            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
-        cdef int i, guess
-        cdef Transition action
-        for i in range(batch_size):
-            self.moves.set_valid(is_valid, states[i])
-            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-            if guess == -1:
-                # This shouldn't happen, but it's hard to raise an error here,
-                # and we don't want to infinite loop. So, force to end state.
-                states[i].force_final()
-            else:
-                action = self.moves.c[guess]
-                action.do(states[i], action.label)
-        free(is_valid)
-
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -503,66 +414,99 @@ cdef class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
+        # We need to take care to act on the whole batch, because we might be
+        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        # The probability we use beam update, instead of falling back to
-        # a greedy update
-        beam_update_prob = self.cfg["beam_update_prob"]
-        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
-            return self.update_beam(
-                examples,
-                beam_width=self.cfg["beam_width"],
-                sgd=sgd,
-                losses=losses,
-                beam_density=self.cfg["beam_density"]
-            )
+        docs = [eg.x for eg in examples if len(eg.x)]
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            init_states, gold_states, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-
-        states_golds = list(zip(states, golds))
-        n_moves = 0
-        while states_golds:
-            states, golds = zip(*states_golds)
-            scores, backprop = model.begin_update(states)
-            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            # Note that the gradient isn't normalized by the batch size
-            # here, because our "samples" are really the states...But we
-            # can't normalize by the number of states either, as then we'd
-            # be getting smaller gradients for states in long sequences.
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        backprop_tok2vec(golds)
+        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
+            max_moves=max_moves, states=[state.copy() for state in init_states])
+        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
+        if sum(s.shape[0] for s in scores) == 0:
+            return losses
+        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
+            examples, max_moves)
+        backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
+        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        del model
+        del backprop_scores
         return losses
 
+    def get_loss(self, states_scores, examples, max_moves):
+        gold_states, init_states, pred_states, scores = states_scores
+        scores = self.model.ops.xp.vstack(scores)
+        costs = self._get_costs_from_histories(
+            examples,
+            gold_states,
+            init_states,
+            [list(state.history) for state in pred_states],
+            max_moves
+        )
+        xp = get_array_module(scores)
+        best_costs = costs.min(axis=1, keepdims=True)
+        gscores = scores.copy()
+        min_score = scores.min() - 1000
+        assert costs.shape == scores.shape, (costs.shape, scores.shape)
+        gscores[costs > best_costs] = min_score
+        max_ = scores.max(axis=1, keepdims=True)
+        gmax = gscores.max(axis=1, keepdims=True)
+        exp_scores = xp.exp(scores - max_)
+        exp_gscores = xp.exp(gscores - gmax)
+        Z = exp_scores.sum(axis=1, keepdims=True)
+        gZ = exp_gscores.sum(axis=1, keepdims=True)
+        d_scores = exp_scores / Z
+        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
+        return d_scores
+
+    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
+        cdef TransitionSystem moves = self.moves
+        cdef StateClass state
+        cdef int clas
+        cdef int nF = self.model.get_dim("nF")
+        cdef int nO = moves.n_moves
+        cdef int nS = sum([len(history) for history in histories])
+        cdef Pool mem = Pool()
+        cdef np.ndarray costs_i
+        is_valid = <int*>mem.alloc(nO, sizeof(int))
+        batch = list(zip(init_states, histories, gold_states))
+        n_moves = 0
+        output = []
+        while batch:
+            costs = numpy.zeros((len(batch), nO), dtype="f")
+            for i, (state, history, gold) in enumerate(batch):
+                costs_i = costs[i]
+                clas = history.pop(0)
+                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
+                action = moves.c[clas]
+                action.do(state.c, action.label)
+                state.c.history.push_back(clas)
+            output.append(costs)
+            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
+            if n_moves >= max_moves >= 1:
+                break
+            n_moves += 1
+
+        return self.model.ops.xp.vstack(output)
+
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -572,10 +516,9 @@ cdef class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.)
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
-        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -583,95 +526,33 @@ cdef class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, backprop_tok2vec = self.model.begin_update(docs)
-        n_scores = 0.
-        loss = 0.
-        while states:
-            targets, _ = tutor.begin_update(states)
-            guesses, backprop = model.begin_update(states)
-            d_scores = (guesses - targets) / targets.shape[0]
-            # If all weights for an output are 0 in the original model, don't
-            # supervise that output. This allows us to add classes.
-            loss += (d_scores**2).sum()
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, guesses)
-            states = [state for state in states if not state.is_final()]
-            n_scores += d_scores.size
-        # Do the backprop
-        backprop_tok2vec(docs)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss / n_scores
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        tutor.clear_memory()
-        del model
-        del tutor
-        return losses
+        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
+        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
+
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
+
+        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
+        student_scores = self.model.ops.xp.vstack(student_scores)
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
+        # If all weights for an output are 0 in the original model, don't
+        # supervise that output. This allows us to add classes.
+        loss = (d_scores**2).sum() / d_scores.size
+        backprop_scores((student_states, d_scores))
 
-    def update_beam(
-        self,
-        examples,
-        *,
-        beam_width,
-        drop=0.,
-        sgd=None,
-        losses=None,
-        beam_density=0.0
-    ):
-        states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        # Prepare the stepwise model, and get the callback for finishing the batch
-        model, backprop_tok2vec = self.model.begin_update(
-            [eg.predicted for eg in examples])
-        loss = _beam_utils.update_beam(
-            self.moves,
-            states,
-            golds,
-            model,
-            beam_width,
-            beam_density=beam_density,
-        )
-        losses[self.name] += loss
-        backprop_tok2vec(golds)
         if sgd is not None:
             self.finish_update(sgd)
+        losses[self.name] += loss
 
-    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
-        cdef StateClass state
-        cdef Pool mem = Pool()
-        cdef int i
-
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        return losses
 
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros(
-            (len(states), self.moves.n_moves), dtype='f', order='C'
-        )
-        c_d_scores = <float*>d_scores.data
-        unseen_classes = self.model.attrs["unseen_classes"]
-        for i, (state, gold) in enumerate(zip(states, golds)):
-            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
-            memset(costs, 0, self.moves.n_moves * sizeof(float))
-            self.moves.set_costs(is_valid, costs, state.c, gold)
-            for j in range(self.moves.n_moves):
-                if costs[j] <= 0.0 and j in unseen_classes:
-                    unseen_classes.remove(j)
-            cpu_log_loss(
-                c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
-            )
-            c_d_scores += d_scores.shape[1]
-        # Note that we don't normalize this. See comment in update() for why.
-        if losses is not None:
-            losses.setdefault(self.name, 0.)
-            losses[self.name] += (d_scores**2).sum()
-        return d_scores
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        raise NotImplementedError
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -710,7 +591,7 @@ cdef class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(doc_sample)
+        self.model.initialize((doc_sample, self.moves))
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -803,26 +684,27 @@ cdef class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long
-        doc will get multiple states. Let's say we have a doc of length 2*N,
-        where N is the shortest doc. We'll make two states, one representing
-        long_doc[:N], and another representing long_doc[N:]."""
+        sequence or a cap. A long doc will get multiple states. Let's say we
+        have a doc of length 2*N, where N is the shortest doc. We'll make
+        two states, one representing long_doc[:N], and another representing
+        long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-        all_states = self.moves.init_batch([eg.predicted for eg in examples])
+            TransitionSystem moves = self.moves
+        all_states = moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if self.moves.has_gold(eg) and not state.is_final():
-                gold = self.moves.init_gold(state, eg)
+            if moves.has_gold(eg) and not state.is_final():
+                gold = moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                    oracle_actions = moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -832,13 +714,52 @@ cdef class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = self.moves.c[clas]
+                    action = moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
+
+
+@contextlib.contextmanager
+def _change_attrs(model, **kwargs):
+    """Temporarily modify a thinc model's attributes."""
+    unset = object()
+    old_attrs = {}
+    for key, value in kwargs.items():
+        old_attrs[key] = model.attrs.get(key, unset)
+        model.attrs[key] = value
+    yield model
+    for key, value in old_attrs.items():
+        if value is unset:
+            model.attrs.pop(key)
+        else:
+            model.attrs[key] = value
+
+
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
+    cdef int step
+    cdef StateClass state
+    cdef StateC* c_state
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for state in states:
+            c_state = state.c
+            if step < c_state.history.size():
+                step_actions.append(c_state.history[step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 54ee053981f..b2c39ae88bc 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -16,6 +16,8 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
+import logging
 
 from ..util import make_tempdir
 
@@ -412,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -539,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -575,7 +577,6 @@ def test_overfitting_IO(use_upper):
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
-        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a943c3538e0..a6e1852514d 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,16 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.pipeline import DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
@@ -59,6 +60,8 @@
     ),
 ]
 
+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1
 
 
@@ -171,6 +174,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -319,7 +373,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -345,11 +399,15 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
+    fix_random_seed(0)
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -451,10 +509,12 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 998f0472c7e..9648341a106 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v2"
+    @architectures = "spacy.TransitionBasedParser.v3"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b36d3ad7473..dd0a53c910e 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -130,33 +130,11 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -187,7 +165,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser
 
@@ -293,15 +270,16 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        assert model.get_ref("upper").get_dim("nI") == 65
-        assert model.get_ref("lower").get_dim("nI") == 65
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -314,11 +292,13 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        if model.attrs["has_upper"]:
-            assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
@@ -514,9 +494,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index d2a41ff0fed..160e2ecf335 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,24 +1,13 @@
 import ctypes
 import os
 from pathlib import Path
-
-import pytest
-
-try:
-    from pydantic.v1 import ValidationError
-except ImportError:
-    from pydantic import ValidationError  # type: ignore
-
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -101,34 +90,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index c6da5157748..4c17fc8f525 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,5 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
-
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..db8f974ea19 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,18 +833,17 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -874,23 +873,22 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                       |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
-</Accordion>
+ </Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 47028f4a2e7..acc2ce1caa2 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -408,7 +408,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -418,7 +418,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -743,7 +743,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -766,7 +766,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index b44df538766..44c80622437 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..1b0bc9606e9 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 3ccc72471b5aa7fec8470ccf6b9ade7380b0ae20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 18:28:30 +0100
Subject: [PATCH 260/504] Fix batching regression (#12094)

* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handed.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
---
 pyproject.toml                        |  2 +-
 requirements.txt                      |  2 +-
 setup.cfg                             |  2 +-
 spacy/tests/pipeline/test_tagger.py   |  4 +-
 spacy/tests/pipeline/test_textcat.py  |  8 +++-
 spacy/tests/training/test_training.py |  4 +-
 spacy/training/batchers.py            | 58 ++++++++++++++-------------
 spacy/util.py                         |  8 ++--
 website/docs/api/top-level.mdx        | 30 +++++++-------
 9 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 58e4382e829..3891d137867 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 0492e3c3631..c2e3512e898 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index c555218c06a..b4f5cbefc17 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,7 +38,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev1,<9.1.0
+    thinc>=9.0.0.dev2,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5da5c209975..b6f94f7f97b 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -24,7 +24,9 @@ def test_issue4348():
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        batches = util.minibatch(
+            TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 214c1bfbed1..2383c36bb01 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -100,7 +100,9 @@ def test_issue3611():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = util.minibatch(
+                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+            )
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -137,7 +139,9 @@ def test_issue4030():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = util.minibatch(
+                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+            )
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 68f7e8a0d57..ef20ec365c6 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -924,7 +924,9 @@ def _train_tuples(train_data):
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
+        batches = minibatch(
+            train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 519e61315da..469bb263016 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,9 +1,10 @@
 import itertools
-from thinc.schedules import Schedule, constant as constant_schedule
+from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int, Schedule[int]]
+SizingSchedule = Union[Iterable[int], int, Schedule]
+Sizing = Union[Iterable[int], int]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -11,7 +12,7 @@
 @registry.batchers("spacy.batch_by_padded.v1")
 def configure_minibatch_by_padded_size(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     buffer: int,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
@@ -21,8 +22,8 @@ def configure_minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
-    size (int or Sequence[int]): The largest padded size to batch sequences into.
-        Can be a single integer, or a sequence, allowing for variable batch sizes.
+    size (int, Iterable[int] or Schedule): The largest padded size to batch sequences
+        into. Can be a single integer, or a sequence, allowing for variable batch sizes.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result
@@ -36,7 +37,7 @@ def configure_minibatch_by_padded_size(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_padded_size,
-        size=size,
+        size=_schedule_to_sizing(size),
         buffer=buffer,
         discard_oversize=discard_oversize,
         **optionals
@@ -46,14 +47,14 @@ def configure_minibatch_by_padded_size(
 @registry.batchers("spacy.batch_by_words.v1")
 def configure_minibatch_by_words(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     tolerance: float,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the "minibatch by words" strategy.
 
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int, Iterable[int] or Schedule): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves
@@ -64,7 +65,7 @@ def configure_minibatch_by_words(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_words,
-        size=size,
+        size=_schedule_to_sizing(size),
         tolerance=tolerance,
         discard_oversize=discard_oversize,
         **optionals
@@ -73,15 +74,15 @@ def configure_minibatch_by_words(
 
 @registry.batchers("spacy.batch_by_sequence.v1")
 def configure_minibatch(
-    size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
+    size: SizingSchedule, get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that creates batches of the specified size.
 
-    size (int or Sequence[int]): The target number of items per batch.
+    size (int, Iterable[int] or Schedule): The target number of items per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     """
     optionals = {"get_length": get_length} if get_length is not None else {}
-    return partial(minibatch, size=size, **optionals)
+    return partial(minibatch, size=_schedule_to_sizing(size), **optionals)
 
 
 def minibatch_by_padded_size(
@@ -97,7 +98,7 @@ def minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.
 
-    size (int or Sequence[int]): The largest padded size to batch sequences into.
+    size (int or Iterable[int]): The largest padded size to batch sequences into.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result
@@ -108,13 +109,12 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
+        size_ = iter(size)
+    for outer_batch in minibatch(seqs, size=buffer):
         outer_batch = list(outer_batch)
-        target_size = size_(step)
+        target_size = next(size_)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -136,7 +136,7 @@ def minibatch_by_words(
     themselves, or be discarded if discard_oversize=True.
 
     seqs (Iterable[Sequence]): The sequences to minibatch.
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int or Iterable[int]): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves
@@ -145,12 +145,10 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    step = 0
-    target_size = size_(step)
+        size_ = iter(size)
+    target_size = next(size_)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -175,8 +173,7 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            step += 1
-            target_size = size_(step)
+            target_size = next(size_)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -194,8 +191,7 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                step += 1
-                target_size = size_(step)
+                target_size = next(size_)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
@@ -232,3 +228,9 @@ def _batch_by_length(
     batches = [list(sorted(batch)) for batch in batches]
     batches.reverse()
     return batches
+
+
+def _schedule_to_sizing(size: SizingSchedule) -> Sizing:
+    if isinstance(size, Schedule):
+        return size.to_generator()
+    return size
diff --git a/spacy/util.py b/spacy/util.py
index 551f78cc969..dedcd17ea58 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1642,12 +1642,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
-        size_ = size
+        size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 77216924405..8555d64ba63 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -878,14 +878,14 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 > get_length = null
 > ```
 
-| Name               | Description                                                                                                                                                                             |
-| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~                                                                                                                                           |
-| `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
-| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                       |
-| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                     |
-| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
-| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
+| Name               | Description                                                                                                                                                                                       |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~                                                                                                                                                     |
+| `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
+| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                                 |
+| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                               |
+| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                           |
+| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                               |
 
 ### spacy.batch_by_sequence.v1 {id="batch_by_sequence",tag="registered function"}
 
@@ -900,11 +900,11 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 
 Create a batcher that creates batches of the specified size.
 
-| Name         | Description                                                                                                                                                                             |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
-| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
-| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
+| Name         | Description                                                                                                                                                                                       |
+| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
+| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                           |
+| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                               |
 
 ### spacy.batch_by_padded.v1 {id="batch_by_padded",tag="registered function"}
 
@@ -926,7 +926,7 @@ sequences in the batch.
 
 | Name               | Description                                                                                                                                                                                                                                 |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~                                          |
+| `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~                                |
 | `buffer`           | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~                                                                                                                                     |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                                                                     |
@@ -1528,7 +1528,7 @@ vary on each step.
 | Name       | Description                                      |
 | ---------- | ------------------------------------------------ |
 | `items`    | The items to batch up. ~~Iterable[Any]~~         |
-| `size`     | The batch size(s). ~~Union[int, Sequence[int]]~~ |
+| `size`     | The batch size(s). ~~Union[int, Iterable[int]]~~ |
 | **YIELDS** | The batches.                                     |
 
 ### util.filter_spans {id="util.filter_spans",tag="function",version="2.1.4"}

From 356f8071ff4678265231706e87aa737eb066c483 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Jan 2023 09:25:34 +0100
Subject: [PATCH 261/504] Set version to v4.0.0.dev0 (#12126)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index f5ee66dae6f..1ce8a44c9a4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "4.0.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From fa4d24117b37b25326acfe8193e6f939f1f779cf Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 25 Jan 2023 12:50:21 +0900
Subject: [PATCH 262/504] Refactor lexeme mem passing (#12125)

* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods calling _new_lexeme, get and get_by_orth, took mem arguments
just to call the internal method. That's no longer necessary, so this
cleans it up.

* prettier formatting

* Remove more unused mem args
---
 spacy/lexeme.pyx                    |  2 +-
 spacy/tokenizer.pxd                 | 76 ++++++++---------------------
 spacy/tokenizer.pyx                 | 39 +++++++--------
 spacy/tokens/doc.pyx                |  8 +--
 spacy/tokens/retokenizer.pyx        |  4 +-
 spacy/vocab.pxd                     |  7 ++-
 spacy/vocab.pyx                     | 30 ++++--------
 website/docs/api/cython-classes.mdx | 20 ++++----
 8 files changed, 67 insertions(+), 119 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 22d5b4a5c3e..b0c3784d86e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -57,7 +57,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index c963dcbcfa4..58d30c3202f 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -31,58 +31,24 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
-    cdef void _filter_special_spans(
-        self,
-        vector[SpanC] &original,
-        vector[SpanC] &filtered,
-        int doc_len,
-    ) nogil
-    cdef object _prepare_special_spans(
-        self,
-        Doc doc,
-        vector[SpanC] &filtered,
-    )
-    cdef int _retokenize_special_spans(
-        self,
-        Doc doc,
-        TokenC* tokens,
-        object span_data,
-    )
-    cdef int _try_specials_and_cache(
-        self,
-        hash_t key,
-        Doc tokens,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _tokenize(
-        self,
-        Doc tokens,
-        str span,
-        hash_t key,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    )
-    cdef int _attach_tokens(
-        self,
-        Doc tokens,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _save_cached(
-        self,
-        const TokenC* tokens,
-        hash_t key,
-        int* has_special,
-        int n,
-    ) except -1
+    cdef void _filter_special_spans(self, vector[SpanC] &original,
+                            vector[SpanC] &filtered, int doc_len) nogil
+    cdef object _prepare_special_spans(self, Doc doc,
+                                       vector[SpanC] &filtered)
+    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
+                                       object span_data)
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
+                       int* has_special, bint with_special_cases) except -1
+    cdef str _split_affixes(self, str string,
+                                vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes, int* has_special,
+                                bint with_special_cases)
+    cdef int _attach_tokens(self, Doc tokens, str string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes, int* has_special,
+                            bint with_special_cases) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int* has_special, int n) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 7c81d936314..6b157d599f1 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -410,22 +410,19 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+        span = self._split_affixes(span, &prefixes, &suffixes,
                                    has_special, with_special_cases)
         self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                             with_special_cases)
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[const LexemeC*] *prefixes,
-        vector[const LexemeC*] *suffixes,
-        int* has_special,
-        bint with_special_cases
-    ):
+    cdef str _split_affixes(self, str string,
+                                vector[const LexemeC*] *prefixes,
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special,
+                                bint with_special_cases):
+        cdef size_t i
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -443,7 +440,7 @@ cdef class Tokenizer:
                 minus_pre = string[pre_len:]
                 if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    prefixes.push_back(self.vocab.get(prefix))
                     break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -451,18 +448,18 @@ cdef class Tokenizer:
                 minus_suf = string[:-suf_len]
                 if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    suffixes.push_back(self.vocab.get(suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(mem, prefix))
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                prefixes.push_back(self.vocab.get(prefix))
+                suffixes.push_back(self.vocab.get(suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(mem, prefix))
+                prefixes.push_back(self.vocab.get(prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(mem, suffix))
+                suffixes.push_back(self.vocab.get(suffix))
         return string
 
     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -487,11 +484,11 @@ cdef class Tokenizer:
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
-                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                    tokens.push_back(self.vocab.get(string), False)
                 else:
                     # Let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
@@ -506,7 +503,7 @@ cdef class Tokenizer:
 
                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            tokens.push_back(self.vocab.get(span), False)
 
                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -514,11 +511,11 @@ cdef class Tokenizer:
                             # for tokenization in some languages (see
                             # https://github.com/explosion/spaCy/issues/768)
                             infix_span = string[infix_start:infix_end]
-                            tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                            tokens.push_back(self.vocab.get(infix_span), False)
                         start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 97b3f800464..2be827b61c7 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -302,12 +302,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1478,7 +1478,7 @@ cdef class Doc:
             end = start + attrs[i, 0]
             has_space = attrs[i, 1]
             orth_ = text[start:end]
-            lex = self.vocab.get(self.mem, orth_)
+            lex = self.vocab.get(orth_)
             self.push_back(lex, has_space)
             start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1583,7 +1583,7 @@ cdef class Doc:
         assert words == reconstructed_words
 
         for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
             self.push_back(lex, has_space)
 
         # Set remaining token-level attributes via Doc.from_array().
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index d3e9c5674cc..c0052ca9a9a 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -220,7 +220,7 @@ def _merge(Doc doc, merges):
             if doc.vocab.vectors_length > 0:
                 doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -360,7 +360,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index b91ce3ab45b..f9e01b186b3 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0a8b390ffa9..dea3a696e10 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -146,7 +146,7 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -163,9 +163,9 @@ cdef class Vocab:
                                                   orth=key, orth_id=string))
             return lex
         else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)
 
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
         `Lexeme` if necessary using memory acquired from the given pool. If the
         pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -177,21 +177,10 @@ cdef class Vocab:
         if lex != NULL:
             return lex
         else:
-            return self._new_lexeme(mem, self.strings[orth])
-
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
-        mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #    mem = self.mem
-        cdef bint is_oov = mem is not self.mem
-        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+            return self._new_lexeme(self.strings[orth])
+
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
         if self.vectors is not None and hasattr(self.vectors, "key2row"):
@@ -205,8 +194,7 @@ cdef class Vocab:
                     value = self.strings.add(value)
                 if value is not None:
                     Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
@@ -277,7 +265,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
+            lex = <LexemeC*>self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
diff --git a/website/docs/api/cython-classes.mdx b/website/docs/api/cython-classes.mdx
index ce7c03940ac..88bd92c723b 100644
--- a/website/docs/api/cython-classes.mdx
+++ b/website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `string`    | The string of the word to look up. ~~str~~                                                                 |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                       |
+| ----------- | ------------------------------------------------- |
+| `string`    | The string of the word to look up. ~~str~~        |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
 
 ### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
 
@@ -183,11 +182,10 @@ vocabulary.
 > lexeme = vocab.get_by_orth(doc[0].lex.norm)
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~                                                     |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
+| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~      |
 
 ## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
 

From d9a682880dd60b0c6ed77a1b2e6359bfe97dcd03 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:29:46 +0100
Subject: [PATCH 263/504] Format

---
 spacy/pipeline/edit_tree_lemmatizer.py |  2 +-
 spacy/pipeline/entity_linker.py        | 12 ++++++++++--
 spacy/pipeline/ner.py                  |  7 +++++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index d5169178b8c..a1bcb98455c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -167,7 +167,7 @@ def get_teacher_student_loss(
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
         loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 0f15ef38d45..6d041b7de3d 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -478,7 +478,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         docs_ents: List[Ragged] = []
         docs_scores: List[Ragged] = []
         if not docs:
-            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+            return {
+                KNOWLEDGE_BASE_IDS: final_kb_ids,
+                "ents": docs_ents,
+                "scores": docs_scores,
+            }
         if isinstance(docs, Doc):
             docs = [docs]
         for doc in docs:
@@ -580,7 +584,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+        return {
+            KNOWLEDGE_BASE_IDS: final_kb_ids,
+            "ents": docs_ents,
+            "scores": docs_scores,
+        }
 
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 4c2a3ac093c..2c5fd89cc5d 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -260,8 +260,11 @@ def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
     def labels(self):
         # Get the labels from the model by looking at the available moves, e.g.
         # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        labels = set(remove_bilu_prefix(move) for move in self.move_names
-                     if move[0] in ("B", "I", "L", "U"))
+        labels = set(
+            remove_bilu_prefix(move)
+            for move in self.move_names
+            if move[0] in ("B", "I", "L", "U")
+        )
         return tuple(sorted(labels))
 
     def scored_ents(self, beams):

From bc8bd6f988fa747ff3063812427bed20c38884b4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:37:02 +0100
Subject: [PATCH 264/504] CI: Skip tests that require published pipelines

---
 .github/azure-steps.yml | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index c7722391fec..fc83d4994b4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -64,12 +64,17 @@ steps:
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)
 
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -m spacy download ca_core_news_sm
+#      python -m spacy download ca_core_news_md
+#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#    displayName: 'Test download CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -93,17 +98,17 @@ steps:
     displayName: 'Test train CLI'
     condition: eq(variables['python_version'], '3.8')
 
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#    displayName: 'Test assemble CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#    displayName: 'Test assemble CLI vectors warning'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json

From 213371b7ac7a96e654c9d9b73d243b34c8d61af2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:48:20 +0100
Subject: [PATCH 265/504] Drop python 3.6/3.7, remove unneeded compat (#12187)

* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
---
 .pre-commit-config.yaml         |  2 +-
 CONTRIBUTING.md                 |  2 +-
 README.md                       |  2 +-
 azure-pipelines.yml             | 20 +----------
 requirements.txt                |  5 ++-
 setup.cfg                       |  6 ++--
 spacy/cli/_util.py              | 10 ++++++
 spacy/cli/debug_data.py         |  8 +++++
 spacy/compat.py                 | 13 -------
 spacy/errors.py                 |  3 +-
 spacy/language.py               | 61 +++++++++++++++------------------
 spacy/matcher/matcher.pyi       | 17 ++-------
 spacy/matcher/phrasematcher.pyi |  7 ++--
 spacy/ml/models/parser.py       |  5 +--
 spacy/pipeline/spancat.py       |  9 +++--
 spacy/schemas.py                |  9 +++++
 spacy/ty.py                     | 16 ++-------
 spacy/util.py                   | 11 +++---
 website/docs/usage/index.mdx    |  2 +-
 19 files changed, 87 insertions(+), 121 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e2c5e98fd97..8efe733f904 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 22.3.0
     hooks:
     - id: black
-      language_version: python3.7
+      language_version: python3.8
       additional_dependencies: ['click==8.0.4']
 -   repo: https://github.com/pycqa/flake8
     rev: 5.0.4
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ed75e1fd8bd..b85ea8fcc4d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -276,7 +276,7 @@ except:  # noqa: E722
 
 ### Python conventions
 
-All Python code must be written **compatible with Python 3.6+**. More detailed
+All Python code must be written **compatible with Python 3.8+**. More detailed
 code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
 
 #### I/O and handling paths
diff --git a/README.md b/README.md
index afa96363b65..9e5c4be6898 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.7+ (only 64 bit)
+- **Python version**: Python 3.8+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 0f7ea91f96f..99f1b8afffe 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: "3.7"
+          versionSpec: "3.8"
       - script: |
           pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
@@ -40,24 +40,6 @@ jobs:
     strategy:
       matrix:
         # We're only running one platform per Python version to speed up builds
-        Python36Linux:
-          imageName: "ubuntu-20.04"
-          python.version: "3.6"
-        #        Python36Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.6"
-        #        Python36Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.6"
-        #        Python37Linux:
-        #          imageName: "ubuntu-20.04"
-        #          python.version: "3.7"
-        Python37Windows:
-          imageName: "windows-latest"
-          python.version: "3.7"
-        #        Python37Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.7"
         #        Python38Linux:
         #          imageName: "ubuntu-latest"
         #          python.version: "3.8"
diff --git a/requirements.txt b/requirements.txt
index c2e3512e898..a68c159d643 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.11,<3.1.0
+spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -31,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=0.990,<0.1000; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/setup.cfg b/setup.cfg
index b4f5cbefc17..3a84f37d3bf 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,7 +17,6 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
@@ -30,10 +29,10 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.11,<3.1.0
+    spacy-legacy>=4.0.0.dev0,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
@@ -54,7 +53,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index fa41e6a08e0..ea91e64247d 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,3 +1,10 @@
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
+from typing import TYPE_CHECKING, overload
+import sys
+import shutil
+from pathlib import Path
+from wasabi import msg, Printer
+import srsly
 import hashlib
 import os
 import shutil
@@ -27,6 +34,9 @@
 from wasabi import Printer, msg
 from weasel import app as project_cli
 
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 from ..compat import Literal
 from ..schemas import validate
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index af3c24f3ba9..c2253b0cb70 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,3 +1,11 @@
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import Literal, cast, overload
+from pathlib import Path
+from collections import Counter
+import sys
+import srsly
+from wasabi import Printer, MESSAGES, msg
+import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/compat.py b/spacy/compat.py
index 522fa30ddde..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,19 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
-# Important note: The importlib_metadata "backport" includes functionality
-# that's not part of the built-in importlib.metadata. We should treat this
-# import like the built-in and only use what's available there.
-try:  # Python 3.8+
-    import importlib.metadata as importlib_metadata
-except ImportError:
-    from catalogue import _importlib_metadata as importlib_metadata  # type: ignore[no-redef]    # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 9074a3fead8..dcf8e60b7a1 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,7 +1,6 @@
+from typing import Literal
 import warnings
 
-from .compat import Literal
-
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
diff --git a/spacy/language.py b/spacy/language.py
index a47cc5df454..161d5b64884 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,3 +1,10 @@
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
+from typing import Union, Tuple, List, Set, Pattern, Sequence
+from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
+
+from dataclasses import dataclass
+import random
+import itertools
 import functools
 import inspect
 import itertools
@@ -30,43 +37,29 @@
     overload,
 )
 
-import srsly
-from thinc.api import Config, CupyOps, Optimizer, get_current_ops
-
-from . import about, ty, util
-from .compat import Literal
+from . import ty
+from .tokens.underscore import Underscore
+from .vocab import Vocab, create_vocab
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
+from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
+from .scorer import Scorer
+from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
+from .util import warn_if_jupyter_cupy
+from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
+from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.punctuation import TOKENIZER_INFIXES
+from .tokens import Doc
+from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from . import util
+from . import about
 from .lookups import load_lookups
-from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
-from .schemas import (
-    ConfigSchema,
-    ConfigSchemaInit,
-    ConfigSchemaNlp,
-    ConfigSchemaPretrain,
-    validate_init_settings,
-)
-from .scorer import Scorer
-from .tokenizer import Tokenizer
-from .tokens import Doc
-from .tokens.underscore import Underscore
-from .training import Example, validate_examples
-from .training.initialize import init_tok2vec, init_vocab
-from .util import (
-    _DEFAULT_EMPTY_PIPES,
-    CONFIG_SECTION_ORDER,
-    SimpleFrozenDict,
-    SimpleFrozenList,
-    _pipe,
-    combine_score_weights,
-    raise_error,
-    registry,
-    warn_if_jupyter_cupy,
-)
-from .vectors import BaseVectors
-from .vocab import Vocab, create_vocab
+
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index c33b534cbd2..a0b6d91e7d5 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,17 +1,6 @@
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Tuple,
-    Union,
-    overload,
-)
-
-from ..compat import Literal
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
+from typing import Iterator, Iterable, overload
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index f9585da7893..45685db228a 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,6 +1,7 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
-
-from ..compat import Literal
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
+from typing import overload
+from .matcher import Matcher
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 59483839206..01312983d86 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,9 @@
-from typing import Optional, List, Tuple, Any
+from typing import Optional, List, Tuple, Any, Literal
 from thinc.types import Floats2d
 from thinc.api import Model
 import warnings
 
 from ...errors import Errors, Warnings
-from ...compat import Literal
-from ...errors import Errors
-from ...tokens import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
 from ...tokens.doc import Doc
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 1450bb5d6cb..bfaaf82e8d0 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,5 +1,5 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union
+from typing import Union, Protocol, runtime_checkable
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
@@ -8,7 +8,12 @@
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
 from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
 
-from ..compat import Protocol, runtime_checkable
+from ..scorer import Scorer
+from ..language import Language
+from .trainable_pipe import TrainablePipe
+from ..tokens import Doc, SpanGroup, Span
+from ..vocab import Vocab
+from ..training import Example, validate_examples
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 9a2b5ed60e9..831f7df058f 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,3 +1,12 @@
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
+from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError, Model
+from thinc.config import Promise
+from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/ty.py b/spacy/ty.py
index f389456c03e..5a2b44aa583 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,17 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Sequence,
-)
-
-from thinc.api import Model, Optimizer
-
-from .compat import Protocol, runtime_checkable
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index dedcd17ea58..de04ee6e718 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,5 +1,6 @@
 import functools
 import importlib
+import importlib.metadata
 import importlib.util
 import re
 from pathlib import Path
@@ -70,7 +71,7 @@
 
 
 from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .compat import cupy, CudaStream, is_windows
 from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, importlib_metadata, is_windows
@@ -748,8 +749,8 @@ def get_package_version(name: str) -> Optional[str]:
     RETURNS (str / None): The version or None if package not installed.
     """
     try:
-        return importlib_metadata.version(name)  # type: ignore[attr-defined]
-    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return importlib.metadata.version(name)  # type: ignore[attr-defined]
+    except importlib.metadata.PackageNotFoundError:  # type: ignore[attr-defined]
         return None
 
 
@@ -937,7 +938,7 @@ def is_package(name: str) -> bool:
     RETURNS (bool): True if installed package, False if not.
     """
     try:
-        importlib_metadata.distribution(name)  # type: ignore[attr-defined]
+        importlib.metadata.distribution(name)  # type: ignore[attr-defined]
         return True
     except:  # noqa: E722
         return False
@@ -1777,7 +1778,7 @@ def packages_distributions() -> Dict[str, List[str]]:
     it's not available in the builtin importlib.metadata.
     """
     pkg_to_dist = defaultdict(list)
-    for dist in importlib_metadata.distributions():
+    for dist in importlib.metadata.distributions():
         for pkg in (dist.read_text("top_level.txt") or "").split():
             pkg_to_dist[pkg].append(dist.metadata["Name"])
     return dict(pkg_to_dist)
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index c50e9db6c6b..b8b4917f2b2 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -20,7 +20,7 @@ menu:
 
 ## Installation instructions {id="installation"}
 
-spaCy is compatible with **64-bit CPython 3.7+** and runs on **Unix/Linux**,
+spaCy is compatible with **64-bit CPython 3.8+** and runs on **Unix/Linux**,
 **macOS/OS X** and **Windows**. The latest spaCy releases are available over
 [pip](https://pypi.python.org/pypi/spacy) and
 [conda](https://anaconda.org/conda-forge/spacy).

From 6a52354b4bb5882d63dff33503165f61fe77a5ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Jan 2023 12:44:11 +0100
Subject: [PATCH 266/504] Add `Language.distill` (#12116)

* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify that how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                       | 108 +++++++++++++++++++++++-
 spacy/pipeline/trainable_pipe.pyx       |   4 +-
 spacy/pipeline/transition_parser.pyx    |   4 +-
 spacy/tests/test_language.py            |  69 +++++++++++++++
 spacy/ty.py                             |  19 +++++
 website/docs/api/dependencyparser.mdx   |  18 ++--
 website/docs/api/edittreelemmatizer.mdx |  18 ++--
 website/docs/api/entityrecognizer.mdx   |  18 ++--
 website/docs/api/language.mdx           |  28 ++++++
 website/docs/api/morphologizer.mdx      |  18 ++--
 website/docs/api/pipe.mdx               |  18 ++--
 website/docs/api/sentencerecognizer.mdx |  18 ++--
 website/docs/api/tagger.mdx             |  18 ++--
 13 files changed, 290 insertions(+), 68 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 161d5b64884..8cd439d10b1 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -41,7 +41,7 @@
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
+from .training import Example, validate_examples, validate_distillation_examples
 from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
@@ -1049,6 +1049,102 @@ def __call__(
                 raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
         return doc
 
+    def distill(
+        self,
+        teacher: "Language",
+        examples: Iterable[Example],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
+        student_to_teacher: Optional[Dict[str, str]] = None,
+    ):
+        """Distill the models in a student pipeline from a teacher pipeline.
+        teacher (Language): Teacher to distill from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
+        drop (float): The dropout rate.
+        sgd (Optional[Optimizer]): An optimizer.
+        losses (Optional(Dict[str, float])): Dictionary to update with the loss,
+            keyed by component.
+        component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
+            for specific pipeline components, keyed by component name.
+        exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
+        student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
+            teacher pipe name, only needed for pipes where the student pipe
+            name does not match the teacher pipe name.
+        RETURNS (Dict[str, float]): The updated losses dictionary
+
+        DOCS: https://spacy.io/api/language#distill
+        """
+        if student_to_teacher is None:
+            student_to_teacher = {}
+        if losses is None:
+            losses = {}
+        if isinstance(examples, list) and len(examples) == 0:
+            return losses
+
+        validate_distillation_examples(examples, "Language.distill")
+        examples = _copy_examples(examples)
+
+        if sgd is None:
+            if self._optimizer is None:
+                self._optimizer = self.create_optimizer()
+            sgd = self._optimizer
+
+        if component_cfg is None:
+            component_cfg = {}
+        pipe_kwargs = {}
+        for student_name, student_proc in self.pipeline:
+            component_cfg.setdefault(student_name, {})
+            pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
+            component_cfg[student_name].setdefault("drop", drop)
+            pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
+
+        teacher_pipes = dict(teacher.pipeline)
+        for student_name, student_proc in self.pipeline:
+            if student_name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=student_proc,
+                        name=student_name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[student_name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
+
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+            ):
+                # A missing teacher pipe is not an error, some student pipes
+                # do not need a teacher, such as tok2vec layer losses.
+                teacher_name = (
+                    student_to_teacher[student_name]
+                    if student_name in student_to_teacher
+                    else student_name
+                )
+                teacher_pipe = teacher_pipes.get(teacher_name, None)
+                student_proc.distill(
+                    teacher_pipe,
+                    examples,
+                    sgd=sgd,
+                    losses=losses,
+                    **component_cfg[student_name],
+                )
+
+        return losses
+
     def disable_pipes(self, *names) -> "DisabledPipes":
         """Disable one or more pipeline components. If used as a context
         manager, the pipeline will be restored to the initial state at the end
@@ -1274,12 +1370,16 @@ def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
+        labels: Optional[Dict[str, Any]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
+        labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
+            using the names of the pipes as keys. Overrides labels that are in
+            the model configuration.
         sgd (Optional[Optimizer]): An optimizer to use for updates. If not
             provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
@@ -1327,6 +1427,8 @@ def get_examples():
         for name, proc in self.pipeline:
             if isinstance(proc, ty.InitializableComponent):
                 p_settings = I["components"].get(name, {})
+                if labels is not None and name in labels:
+                    p_settings["labels"] = labels[name]
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
@@ -1800,6 +1902,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
+        orig_distill = config.pop("distill", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1808,6 +1911,9 @@ def from_config(
             filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
+        if orig_distill is not None:
+            filled["distill"] = orig_distill
+            config["distill"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 3ec3e7551aa..97442a1aa97 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -74,8 +74,8 @@ cdef class TrainablePipe(Pipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d71a4ab0355..6a50dbacaeb 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -233,8 +233,8 @@ class Parser(TrainablePipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index d229739e1ee..8138cb157d2 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -28,6 +28,12 @@
     pass
 
 
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
@@ -805,3 +811,66 @@ def bad_pipe(doc):
     nlp.add_pipe("test_component_bad_pipe")
     with pytest.raises(ValueError, match="instead of a Doc"):
         nlp("text")
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"])
+def test_distill(teacher_tagger_name):
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name)
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses[teacher_tagger_name] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TAGGER_TRAIN_DATA
+    ]
+
+    student_to_teacher = (
+        None
+        if teacher_tagger.name == student_tagger.name
+        else {student_tagger.name: teacher_tagger.name}
+    )
+
+    for i in range(50):
+        losses = {}
+        student.distill(
+            teacher,
+            distill_examples,
+            sgd=optimizer,
+            losses=losses,
+            student_to_teacher=student_to_teacher,
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Do an extra update to check if annotates works, though we can't really
+    # validate the resuls, since the annotations are ephemeral.
+    student.distill(
+        teacher,
+        distill_examples,
+        sgd=optimizer,
+        losses=losses,
+        student_to_teacher=student_to_teacher,
+        annotates=["tagger"],
+    )
diff --git a/spacy/ty.py b/spacy/ty.py
index 5a2b44aa583..ac09cb336ac 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -25,6 +25,25 @@ def finish_update(self, sgd: Optimizer) -> None:
         ...
 
 
+@runtime_checkable
+class DistillableComponent(Protocol):
+    is_distillable: bool
+
+    def distill(
+        self,
+        teacher_pipe: Optional[TrainableComponent],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None
+    ) -> Dict[str, float]:
+        ...
+
+    def finish_update(self, sgd: Optimizer) -> None:
+        ...
+
+
 @runtime_checkable
 class InitializableComponent(Protocol):
     def initialize(
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 5179ce48b84..296d6d87da5 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -154,15 +154,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 2e099365758..c8b5c71806b 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -138,15 +138,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 005d5d11deb..f503cc998b0 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -150,15 +150,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index d5fbae05ec4..2a1f7a1a961 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -333,6 +333,34 @@ and custom registered functions if needed. See the
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
 
+## Language.distill {id="distill",tag="method,experimental",version="4"}
+
+Distill the models in a student pipeline from a teacher pipeline.
+
+> #### Example
+>
+> ```python
+>
+> teacher = spacy.load("en_core_web_lg")
+> student = English()
+> student.add_pipe("tagger")
+> student.distill(teacher, examples, sgd=optimizer)
+> ```
+
+| Name                 | Description                                                                                                                                                                                 |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_       |                                                                                                                                                                                             |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4f79458d319..4660ec312fa 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -144,15 +144,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index 120c8f6908f..e1e7f5d7021 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -257,15 +257,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 02fd57102e2..dfb7ed308ba 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -129,15 +129,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 664fd7940c1..35e7a23b174 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -128,15 +128,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Tagger.pipe {id="pipe",tag="method"}
 

From 56d7cced647004eb239f13e11dbd532a6a3ba2b8 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 31 Jan 2023 19:31:17 +0900
Subject: [PATCH 267/504] Don't re-download installed models (#12188)

* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 .github/azure-steps.yml |  5 +++++
 spacy/cli/download.py   | 11 ++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index fc83d4994b4..11dc7e295e4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -74,6 +74,11 @@ steps:
 #  - script: |
 #      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
 #    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
+#    displayName: 'Test skip re-download (#12188)'
 #    condition: eq(variables['python_version'], '3.8')
 
   - script: |
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index f371d110319..ea7a06c5417 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,8 @@
 
 from .. import about
 from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
+from ..util import get_package_version
 
 
 @app.command(
@@ -71,6 +72,14 @@ def download(
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
 
+    # If we already have this version installed, skip downloading
+    installed = get_installed_models()
+    if model_name in installed:
+        installed_version = get_package_version(model_name)
+        if installed_version == version:
+            msg.warn(f"{model_name} v{version} already installed, skipping")
+            return
+
     filename = get_model_filename(model_name, version, sdist)
 
     download_model(filename, pip_args)

From d7054b7127b3bf4de0b18db09860b6738a1508c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:06:02 +0100
Subject: [PATCH 268/504] Add the configuration schema for distillation
 (#12201)

* Add the configuration schema for distillation

This also adds the default configuration and some tests. The schema will
be used by the training loop and `distill` subcommand.

* Format

* Change distillation shortopt to -d

* Fix descripion of max_epochs

* Rename distillation flag to -dt

* Rename `pipe_map` to `student_to_teacher`
---
 spacy/cli/init_config.py                      | 15 +++-
 spacy/default_config_distillation.cfg         | 34 ++++++++
 spacy/language.py                             |  3 +
 spacy/schemas.py                              | 23 +++++
 .../tests/serialize/test_serialize_config.py  | 85 +++++++++++++++----
 5 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 spacy/default_config_distillation.cfg

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index a7c03d00f90..129b5a24e84 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -9,7 +9,7 @@
 from wasabi import Printer, diff_strings
 
 from .. import util
-from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
+from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
 from ._util import (
@@ -90,6 +90,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
+    distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
     pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
     code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@@ -105,13 +106,20 @@ def init_fill_config_cli(
     DOCS: https://spacy.io/api/cli#init-fill-config
     """
     import_code(code_path)
-    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
+    fill_config(
+        output_file,
+        base_path,
+        distillation=distillation,
+        pretraining=pretraining,
+        diff=diff,
+    )
 
 
 def fill_config(
     output_file: Path,
     base_path: Path,
     *,
+    distillation: bool = False,
     pretraining: bool = False,
     diff: bool = False,
     silent: bool = False,
@@ -130,6 +138,9 @@ def fill_config(
     # replaced with their actual config after loading, so we have to re-add them
     sourced = util.get_sourced_components(config)
     filled["components"].update(sourced)
+    if distillation:
+        distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
+        filled = distillation_config.merge(filled)
     if pretraining:
         validate_config_for_pretrain(filled, msg)
         pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
diff --git a/spacy/default_config_distillation.cfg b/spacy/default_config_distillation.cfg
new file mode 100644
index 00000000000..1926fafa961
--- /dev/null
+++ b/spacy/default_config_distillation.cfg
@@ -0,0 +1,34 @@
+[paths]
+raw_text = null
+
+[distillation]
+corpus = "corpora.distillation"
+dropout = 0.1
+max_epochs = 1
+max_steps = 0
+student_to_teacher = {}
+
+[distillation.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 3000
+discard_oversize = false
+tolerance = 0.2
+
+[distillation.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 1e-4
+
+[corpora]
+
+[corpora.distillation]
+@readers = "spacy.PlainTextCorpus.v1"
+path = ${paths.raw_text}
+min_length = 0
+max_length = 0
diff --git a/spacy/language.py b/spacy/language.py
index 8cd439d10b1..a1fa61d0923 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -67,6 +67,9 @@
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
+# This is the base config for the [distillation] block and currently not included
+# in the main config and only added via the 'init fill-config' command
+DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
 # This is the base config for the [pretraining] block and currently not included
 # in the main config and only added via the 'init fill-config' command
 DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 831f7df058f..32fb042b5a0 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -470,6 +470,27 @@ class Config:
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaDistillEmpty(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
+class ConfigSchemaDistill(BaseModel):
+    # fmt: off
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
+    dropout: StrictFloat = Field(..., title="Dropout rate")
+    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
+    max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
+    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
@@ -477,6 +498,7 @@ class ConfigSchema(BaseModel):
     components: Dict[str, Dict[str, Any]]
     corpora: Dict[str, Reader]
     initialize: ConfigSchemaInit
+    distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]
 
     class Config:
         extra = "allow"
@@ -488,6 +510,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
+    "distill": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index dd0a53c910e..eb0dcc1e38c 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,21 +5,14 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH, Language
-from spacy.ml.models import (
-    MaxoutWindowEncoder,
-    MultiHashEmbed,
-    build_tb_parser_model,
-    build_Tok2Vec_model,
-)
-from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
-from spacy.training import Example
-from spacy.util import (
-    load_config,
-    load_config_from_str,
-    load_model_from_config,
-    registry,
-)
+from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
+from spacy.language import Language
+from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
+from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.util import load_config, load_config_from_str
+from spacy.util import load_model_from_config, registry
 
 from ..util import make_tempdir
 
@@ -74,6 +67,60 @@
 width = ${components.tok2vec.model.width}
 """
 
+distill_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v2"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[distill]
+"""
+
+
 pretrain_config_string = """
 [paths]
 train = null
@@ -209,6 +256,14 @@ def test_create_nlp_from_config():
         load_model_from_config(Config(bad_cfg), auto_fill=True)
 
 
+def test_nlp_from_distillation_config():
+    """Test that the default distillation config validates properly"""
+    config = Config().from_str(distill_config_string)
+    distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH)
+    filled = config.merge(distill_config)
+    registry.resolve(filled["distillation"], schema=ConfigSchemaDistill)
+
+
 def test_create_nlp_from_pretraining_config():
     """Test that the default pretraining config validates properly"""
     config = Config().from_str(pretrain_config_string)

From 70b912219dd772d3572ea5f59acc7601cde321e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:19:42 +0100
Subject: [PATCH 269/504] Language.distill: copy both reference and predicted
 (#12209)

* Language.distill: copy both reference and predicted

In distillation we also modify the teacher docs (e.g. in tok2vec
components), so we need to copy both the reference and predicted doc.

Problem caught by @shadeMe

* Make new `_copy_examples` args kwonly
---
 spacy/language.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index a1fa61d0923..cb9652e97bf 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1094,7 +1094,7 @@ def distill(
             return losses
 
         validate_distillation_examples(examples, "Language.distill")
-        examples = _copy_examples(examples)
+        examples = _copy_examples(examples, copy_x=True, copy_y=True)
 
         if sgd is None:
             if self._optimizer is None:
@@ -2409,13 +2409,18 @@ def restore(self) -> None:
         self[:] = []
 
 
-def _copy_examples(examples: Iterable[Example]) -> List[Example]:
+def _copy_examples(
+    examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
+) -> List[Example]:
     """Make a copy of a batch of examples, copying the predicted Doc as well.
     This is used in contexts where we need to take ownership of the examples
     so that they can be mutated, for instance during Language.evaluate and
     Language.update.
     """
-    return [Example(eg.x.copy(), eg.y) for eg in examples]
+    return [
+        Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
+        for eg in examples
+    ]
 
 
 def _apply_pipes(

From e3ccc59df1f5dc45df500b6bb48ab7463a8c5aae Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:30:43 +0100
Subject: [PATCH 270/504] Rename language codes (Icelandic, multi-language)
 (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/_util.py                            | 153 ++++++++++++++++++
 spacy/cli/convert.py                          |   6 +
 spacy/cli/init_config.py                      |  18 +--
 spacy/cli/init_pipeline.py                    |  19 +--
 spacy/errors.py                               |   1 +
 spacy/lang/{is => isl}/__init__.py            |   2 +-
 spacy/lang/{is => isl}/stop_words.py          |   0
 spacy/lang/{xx => mul}/__init__.py            |   4 +-
 spacy/lang/{xx => mul}/examples.py            |   0
 spacy/scorer.py                               |   2 +-
 spacy/tests/README.md                         |   2 +-
 spacy/tests/conftest.py                       |  10 +-
 spacy/tests/doc/test_doc_api.py               |   2 +-
 spacy/tests/lang/{is => isl}/__init__.py      |   0
 spacy/tests/lang/{is => isl}/test_text.py     |   8 +-
 .../tests/lang/{is => isl}/test_tokenizer.py  |   8 +-
 spacy/tests/lang/{xx => mul}/__init__.py      |   0
 spacy/tests/lang/{xx => mul}/test_text.py     |   4 +-
 .../tests/lang/{xx => mul}/test_tokenizer.py  |   8 +-
 spacy/tests/lang/test_initialize.py           |   6 +-
 spacy/tests/pipeline/test_span_ruler.py       |  52 +++---
 spacy/tests/test_language.py                  |   9 +-
 spacy/tests/tokenizer/test_explain.py         |   1 +
 .../training/converters/conll_ner_to_docs.py  |   4 +-
 spacy/training/converters/json_to_docs.py     |  12 +-
 spacy/util.py                                 |   8 +-
 website/docs/api/scorer.mdx                   |   2 +-
 website/docs/usage/models.mdx                 |  12 +-
 website/meta/languages.json                   |   6 +-
 website/src/widgets/quickstart-models.js      |   2 +-
 30 files changed, 254 insertions(+), 107 deletions(-)
 rename spacy/lang/{is => isl}/__init__.py (93%)
 rename spacy/lang/{is => isl}/stop_words.py (100%)
 rename spacy/lang/{xx => mul}/__init__.py (67%)
 rename spacy/lang/{xx => mul}/examples.py (100%)
 rename spacy/tests/lang/{is => isl}/__init__.py (100%)
 rename spacy/tests/lang/{is => isl}/test_text.py (85%)
 rename spacy/tests/lang/{is => isl}/test_tokenizer.py (72%)
 rename spacy/tests/lang/{xx => mul}/__init__.py (100%)
 rename spacy/tests/lang/{xx => mul}/test_text.py (96%)
 rename spacy/tests/lang/{xx => mul}/test_tokenizer.py (68%)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ea91e64247d..52a70cc7320 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -37,6 +37,7 @@
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..compat import Literal
 from ..schemas import validate
@@ -158,6 +159,158 @@ def _parse_override(value: Any) -> Any:
         return str(value)
 
 
+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
+def load_project_config(
+    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
+) -> Dict[str, Any]:
+    """Load the project.yml file from a directory and validate it. Also make
+    sure that all directories defined in the config exist.
+
+    path (Path): The path to the project directory.
+    interpolate (bool): Whether to substitute project variables.
+    overrides (Dict[str, Any]): Optional config overrides.
+    RETURNS (Dict[str, Any]): The loaded project.yml.
+    """
+    config_path = path / PROJECT_FILE
+    if not config_path.exists():
+        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
+    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
+    try:
+        config = srsly.read_yaml(config_path)
+    except ValueError as e:
+        msg.fail(invalid_err, e, exits=1)
+    errors = validate(ProjectConfigSchema, config)
+    if errors:
+        msg.fail(invalid_err)
+        print("\n".join(errors))
+        sys.exit(1)
+    validate_project_version(config)
+    validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
+    # Make sure directories defined in config exist
+    for subdir in config.get("directories", []):
+        dir_path = path / subdir
+        if not dir_path.exists():
+            dir_path.mkdir(parents=True)
+    return config
+
+
+def substitute_project_variables(
+    config: Dict[str, Any],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    key: str = "vars",
+    env_key: str = "env",
+) -> Dict[str, Any]:
+    """Interpolate variables in the project file using the config system.
+
+    config (Dict[str, Any]): The project config.
+    overrides (Dict[str, Any]): Optional config overrides.
+    key (str): Key containing variables in project config.
+    env_key (str): Key containing environment variable mapping in project config.
+    RETURNS (Dict[str, Any]): The interpolated project config.
+    """
+    config.setdefault(key, {})
+    config.setdefault(env_key, {})
+    # Substitute references to env vars with their values
+    for config_var, env_var in config[env_key].items():
+        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
+    # Need to put variables in the top scope again so we can have a top-level
+    # section "project" (otherwise, a list of commands in the top scope wouldn't)
+    # be allowed by Thinc's config system
+    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
+    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
+    interpolated = cfg.interpolate()
+    return dict(interpolated["project"])
+
+
+def validate_project_version(config: Dict[str, Any]) -> None:
+    """If the project defines a compatible spaCy version range, chec that it's
+    compatible with the current version of spaCy.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    spacy_version = config.get("spacy_version", None)
+    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+        err = (
+            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+            f"that's not compatible with the version of spaCy you're running "
+            f"({about.__version__}). You can edit version requirement in the "
+            f"{PROJECT_FILE} to load it, but the project may not run as expected."
+        )
+        msg.fail(err, exits=1)
+
+
+def validate_project_commands(config: Dict[str, Any]) -> None:
+    """Check that project commands and workflows are valid, don't contain
+    duplicates, don't clash  and only refer to commands that exist.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    command_names = [cmd["name"] for cmd in config.get("commands", [])]
+    workflows = config.get("workflows", {})
+    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
+    if duplicates:
+        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
+        msg.fail(err, exits=1)
+    for workflow_name, workflow_steps in workflows.items():
+        if workflow_name in command_names:
+            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
+            msg.fail(err, exits=1)
+        for step in workflow_steps:
+            if step not in command_names:
+                msg.fail(
+                    f"Unknown command specified in workflow '{workflow_name}': {step}",
+                    f"Workflows can only refer to commands defined in the 'commands' "
+                    f"section of the {PROJECT_FILE}.",
+                    exits=1,
+                )
+
+
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
+    """Get the hash for a JSON-serializable object.
+
+    data: The data to hash.
+    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
+    RETURNS (str): The hash.
+    """
+    if isinstance(data, dict):
+        data = {k: v for k, v in data.items() if k not in exclude}
+    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
+    return hashlib.md5(data_str).hexdigest()
+
+
+def get_checksum(path: Union[Path, str]) -> str:
+    """Get the checksum for a file or directory given its file path. If a
+    directory path is provided, this uses all files in that directory.
+
+    path (Union[Path, str]): The file or directory path.
+    RETURNS (str): The checksum.
+    """
+    path = Path(path)
+    if not (path.is_file() or path.is_dir()):
+        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
+    if path.is_file():
+        return hashlib.md5(Path(path).read_bytes()).hexdigest()
+    else:
+        # TODO: this is currently pretty slow
+        dir_checksum = hashlib.md5()
+        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
+            dir_checksum.update(sub_file.read_bytes())
+        return dir_checksum.hexdigest()
+
+
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a66a68133b3..3844b340678 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,6 +8,8 @@
 import srsly
 from wasabi import Printer
 
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
@@ -116,6 +118,10 @@ def convert(
     input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(input_path, converter):
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 129b5a24e84..b29a2b748f2 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,15 +12,9 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import (
-    COMMAND,
-    Arg,
-    Opt,
-    import_code,
-    init_cli,
-    show_validation_error,
-    string_to_list,
-)
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import string_to_list, import_code, _handle_renamed_language_codes
+
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
@@ -50,7 +44,7 @@ class InitValues:
 def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
+    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
     pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -176,6 +170,10 @@ def init_config(
     msg = Printer(no_print=silent)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     # Filter out duplicates since tok2vec and transformer are added by template
     pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
     defaults = RECOMMENDATIONS["__default__"]
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 21eea8edf2f..0ff39d2145b 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,16 +8,8 @@
 
 from .. import util
 from ..language import Language
-from ..training.initialize import convert_vectors, init_nlp
-from ._util import (
-    Arg,
-    Opt,
-    import_code,
-    init_cli,
-    parse_config_overrides,
-    setup_gpu,
-    show_validation_error,
-)
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes
 
 
 @init_cli.command("vectors")
@@ -39,8 +31,11 @@ def init_vectors_cli(
     you can use in the [initialize] block of your config to initialize
     a model with vectors.
     """
-    if verbose:
-        util.logger.setLevel(logging.DEBUG)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
diff --git a/spacy/errors.py b/spacy/errors.py
index dcf8e60b7a1..c8c595395b3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -981,6 +981,7 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
 
+RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
 
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/isl/__init__.py
similarity index 93%
rename from spacy/lang/is/__init__.py
rename to spacy/lang/isl/__init__.py
index af126004536..50929620ced 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/isl/__init__.py
@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
 
 
 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults
 
 
diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/isl/stop_words.py
similarity index 100%
rename from spacy/lang/is/stop_words.py
rename to spacy/lang/isl/stop_words.py
diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/mul/__init__.py
similarity index 67%
rename from spacy/lang/xx/__init__.py
rename to spacy/lang/mul/__init__.py
index aff8403ffc7..5170f1e861f 100644
--- a/spacy/lang/xx/__init__.py
+++ b/spacy/lang/mul/__init__.py
@@ -3,10 +3,10 @@
 
 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """
 
-    lang = "xx"
+    lang = "mul"
 
 
 __all__ = ["MultiLanguage"]
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/mul/examples.py
similarity index 100%
rename from spacy/lang/xx/examples.py
rename to spacy/lang/mul/examples.py
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 9ab116deb3f..b590f86337e 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -114,7 +114,7 @@ class Scorer:
     def __init__(
         self,
         nlp: Optional["Language"] = None,
-        default_lang: str = "xx",
+        default_lang: str = "mul",
         default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
         **cfg,
     ) -> None:
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index f3c96a39e7c..9ac1e6d2e34 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -86,7 +86,7 @@ These are the main fixtures that are currently available:
 
 | Fixture                             | Description                                                                  |
 | ----------------------------------- | ---------------------------------------------------------------------------- |
-| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class. |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `mul` language class. |
 | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                   |
 | `en_vocab`                          | Creates an instance of the English `Vocab`.                                  |
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 6085b89cf02..fdc9f192c2f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -85,7 +85,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):
 
 @pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class("xx")().tokenizer
+    return get_lang_class("mul")().tokenizer
 
 
 @pytest.fixture(scope="session")
@@ -250,8 +250,8 @@ def id_tokenizer():
 
 
 @pytest.fixture(scope="session")
-def is_tokenizer():
-    return get_lang_class("is")().tokenizer
+def isl_tokenizer():
+    return get_lang_class("isl")().tokenizer
 
 
 @pytest.fixture(scope="session")
@@ -513,8 +513,8 @@ def vi_tokenizer():
 
 
 @pytest.fixture(scope="session")
-def xx_tokenizer():
-    return get_lang_class("xx")().tokenizer
+def mul_tokenizer():
+    return get_lang_class("mul")().tokenizer
 
 
 @pytest.fixture(scope="session")
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 946910b29e1..518db02e6b3 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -18,7 +18,7 @@
     TAG,
 )
 from spacy.lang.en import English
-from spacy.lang.xx import MultiLanguage
+from spacy.lang.mul import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/isl/__init__.py
similarity index 100%
rename from spacy/tests/lang/is/__init__.py
rename to spacy/tests/lang/isl/__init__.py
diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/isl/test_text.py
similarity index 85%
rename from spacy/tests/lang/is/test_text.py
rename to spacy/tests/lang/isl/test_text.py
index 6e3654a6eda..9e177485d09 100644
--- a/spacy/tests/lang/is/test_text.py
+++ b/spacy/tests/lang/isl/test_text.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-def test_long_text(is_tokenizer):
+def test_long_text(isl_tokenizer):
     # Excerpt: European Convention on Human Rights
     text = """
 hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja
@@ -15,12 +15,12 @@ def test_long_text(is_tokenizer):
 virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
 og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
 """
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 120
 
 
 @pytest.mark.xfail
-def test_ordinal_number(is_tokenizer):
+def test_ordinal_number(isl_tokenizer):
     text = "10. desember 1948"
-    tokens = is_tokenizer(text)
+    tokens = isl_tokenizer(text)
     assert len(tokens) == 3
diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/isl/test_tokenizer.py
similarity index 72%
rename from spacy/tests/lang/is/test_tokenizer.py
rename to spacy/tests/lang/isl/test_tokenizer.py
index 0c05a605001..ba534aaf662 100644
--- a/spacy/tests/lang/is/test_tokenizer.py
+++ b/spacy/tests/lang/isl/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-IS_BASIC_TOKENIZATION_TESTS = [
+ISL_BASIC_TOKENIZATION_TESTS = [
     (
         "Enginn maður skal sæta pyndingum eða ómannlegri eða "
         "vanvirðandi meðferð eða refsingu. ",
@@ -23,8 +23,8 @@
 ]
 
 
-@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
-def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
-    tokens = is_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
+def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
+    tokens = isl_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/mul/__init__.py
similarity index 100%
rename from spacy/tests/lang/xx/__init__.py
rename to spacy/tests/lang/mul/__init__.py
diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/mul/test_text.py
similarity index 96%
rename from spacy/tests/lang/xx/test_text.py
rename to spacy/tests/lang/mul/test_text.py
index 477f0ebe271..6e4262d6696 100644
--- a/spacy/tests/lang/xx/test_text.py
+++ b/spacy/tests/lang/mul/test_text.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-def test_long_text(xx_tokenizer):
+def test_long_text(mul_tokenizer):
     # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
     text = """
 Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.  
@@ -20,5 +20,5 @@ def test_long_text(xx_tokenizer):
 Sääʹmteʹǧǧ.
 """
 
-    tokens = xx_tokenizer(text)
+    tokens = mul_tokenizer(text)
     assert len(tokens) == 179
diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/mul/test_tokenizer.py
similarity index 68%
rename from spacy/tests/lang/xx/test_tokenizer.py
rename to spacy/tests/lang/mul/test_tokenizer.py
index 15c760a6b85..3d06dc11cf7 100644
--- a/spacy/tests/lang/xx/test_tokenizer.py
+++ b/spacy/tests/lang/mul/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-XX_BASIC_TOKENIZATION_TESTS = [
+MUL_BASIC_TOKENIZATION_TESTS = [
     (
         "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
         [
@@ -18,8 +18,8 @@
 ]
 
 
-@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
-def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
-    tokens = xx_tokenizer(text)
+@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
+def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
+    tokens = mul_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index 8a158647a69..e0fd534d317 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -7,10 +7,10 @@
 # excluded: ja, ko, th, vi, zh
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
-             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "yo"]
 # fmt: on
 
 
diff --git a/spacy/tests/pipeline/test_span_ruler.py b/spacy/tests/pipeline/test_span_ruler.py
index 0a8616f449b..3dfbccf28e2 100644
--- a/spacy/tests/pipeline/test_span_ruler.py
+++ b/spacy/tests/pipeline/test_span_ruler.py
@@ -46,7 +46,7 @@ def person_org_date_patterns(person_org_patterns):
 
 def test_span_ruler_add_empty(patterns):
     """Test that patterns don't get added excessively."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@@ -57,7 +57,7 @@ def test_span_ruler_add_empty(patterns):
 
 
 def test_span_ruler_init(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
@@ -73,7 +73,7 @@ def test_span_ruler_init(patterns):
 
 
 def test_span_ruler_no_patterns_warns():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
@@ -85,7 +85,7 @@ def test_span_ruler_no_patterns_warns():
 
 def test_span_ruler_init_patterns(patterns):
     # initialize with patterns
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
@@ -109,7 +109,7 @@ def test_span_ruler_init_patterns(patterns):
 
 def test_span_ruler_init_clear(patterns):
     """Test that initialization clears patterns."""
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -118,7 +118,7 @@ def test_span_ruler_init_clear(patterns):
 
 
 def test_span_ruler_clear(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
@@ -132,7 +132,7 @@ def test_span_ruler_clear(patterns):
 
 
 def test_span_ruler_existing(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -147,7 +147,7 @@ def test_span_ruler_existing(patterns):
 
 
 def test_span_ruler_existing_overwrite(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     doc = nlp.make_doc("OH HELLO WORLD bye bye")
@@ -160,13 +160,13 @@ def test_span_ruler_existing_overwrite(patterns):
 
 
 def test_span_ruler_serialize_bytes(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_nlp = spacy.blank("xx")
+    new_nlp = spacy.blank("mul")
     new_ruler = new_nlp.add_pipe("span_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
@@ -180,7 +180,7 @@ def test_span_ruler_serialize_bytes(patterns):
 
 
 def test_span_ruler_validate():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     validated_ruler = nlp.add_pipe(
         "span_ruler", name="validated_span_ruler", config={"validate": True}
@@ -202,14 +202,14 @@ def test_span_ruler_validate():
 
 
 def test_span_ruler_properties(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
     ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
 
 
 def test_span_ruler_overlapping_spans(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -219,7 +219,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
 
 
 def test_span_ruler_scorer(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(overlapping_patterns)
     text = "foo bar baz"
@@ -242,7 +242,7 @@ def test_span_ruler_multiprocessing(n_process):
 
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
 
-        nlp = spacy.blank("xx")
+        nlp = spacy.blank("mul")
         ruler = nlp.add_pipe("span_ruler")
         ruler.add_patterns(patterns)
 
@@ -252,7 +252,7 @@ def test_span_ruler_multiprocessing(n_process):
 
 
 def test_span_ruler_serialize_dir(patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
@@ -263,7 +263,7 @@ def test_span_ruler_serialize_dir(patterns):
 
 
 def test_span_ruler_remove_basic(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina went to school"))
@@ -278,7 +278,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
 
 
 def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     assert len(ruler.patterns) == 3
@@ -289,7 +289,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
 
 
 def test_span_ruler_remove_several_patterns(person_org_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@@ -313,7 +313,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
 
 
 def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@@ -331,7 +331,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
 
 
 def test_span_ruler_remove_all_patterns(person_org_date_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     ruler.add_patterns(person_org_date_patterns)
     assert len(ruler.patterns) == 4
@@ -347,7 +347,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
 
 
 def test_span_ruler_remove_and_add():
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler")
     patterns1 = [{"label": "DATE1", "pattern": "last time"}]
     ruler.add_patterns(patterns1)
@@ -403,7 +403,7 @@ def test_span_ruler_remove_and_add():
 
 
 def test_span_ruler_spans_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@@ -415,7 +415,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
 
 
 def test_span_ruler_ents_default_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
     ruler.add_patterns(overlapping_patterns)
     doc = ruler(nlp.make_doc("foo bar baz"))
@@ -424,7 +424,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
 
 
 def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
@@ -451,7 +451,7 @@ def pass_through_filter(spans1, spans2):
 
         return pass_through_filter
 
-    nlp = spacy.blank("xx")
+    nlp = spacy.blank("mul")
     ruler = nlp.add_pipe(
         "span_ruler",
         config={
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 8138cb157d2..b419d77b51d 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -666,11 +666,12 @@ def test_spacy_blank():
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("mul", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
+        ("xx", "mul"),
         ("zh-Hans", "zh"),
         ("zh-Hant", None),
         ("zxx", None),
@@ -691,11 +692,11 @@ def test_language_matching(lang, target):
         ("fra", "fr"),
         ("fre", "fr"),
         ("iw", "he"),
+        ("is", "isl"),
         ("mo", "ro"),
-        ("mul", "xx"),
+        ("xx", "mul"),
         ("no", "nb"),
         ("pt-BR", "pt"),
-        ("xx", "xx"),
         ("zh-Hans", "zh"),
     ],
 )
diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index 78932f6539c..073899fa50a 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -36,6 +36,7 @@
     "hu",
     pytest.param("id", marks=pytest.mark.slow()),
     pytest.param("it", marks=pytest.mark.slow()),
+    pytest.param("isl", marks=pytest.mark.slow()),
     pytest.param("kn", marks=pytest.mark.slow()),
     pytest.param("lb", marks=pytest.mark.slow()),
     pytest.param("lt", marks=pytest.mark.slow()),
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index b19d1791b27..c3490d4a494 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
         if not conll_doc:
@@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             "Segmenting sentences with sentencizer. (Use `-b model` for "
             "improved parser-based sentence segmentation.)"
         )
-        nlp = get_lang_class("xx")()
+        nlp = get_lang_class("mul")()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
     words = [line.strip().split()[0] for line in lines]
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index b4beedd2f27..1ff7a64e09d 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,13 +1,9 @@
 import srsly
-
-from ...lang.xx import MultiLanguage
-from ...util import load_model
-from ..example import (
-    _fix_legacy_dict_data,
-    _parse_example_dict_data,
-    annotations_to_doc,
-)
 from ..gold_io import json_iterate, json_to_annotations
+from ..example import annotations_to_doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.mul import MultiLanguage
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/util.py b/spacy/util.py
index de04ee6e718..8c402a74ce9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -325,7 +325,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     import spacy.lang  # noqa: F401
 
     if lang == "xx":
-        return "xx"
+        return "mul"
 
     # Find out which language modules we have
     possible_languages = []
@@ -343,11 +343,7 @@ def find_matching_language(lang: str) -> Optional[str]:
     # is labeled that way is probably trying to be distinct from 'zh' and
     # shouldn't automatically match.
     match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
-    if match == "mul":
-        # Convert 'mul' back to spaCy's 'xx'
-        return "xx"
-    else:
-        return match
+    return match
 
 
 def get_lang_class(lang: str) -> Type["Language"]:
diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx
index 9bdd0a8f435..0c2eefc6722 100644
--- a/website/docs/api/scorer.mdx
+++ b/website/docs/api/scorer.mdx
@@ -30,7 +30,7 @@ Create a new `Scorer`.
 | Name               | Description                                                                                                                                                                                                                               |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `nlp`              | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
-| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~                                                                                                                                            |
+| `default_lang`     | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~                                                                                                                                           |
 | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~                                                     |
 | _keyword-only_     |                                                                                                                                                                                                                                           |
 | `**kwargs`         | Any additional settings to pass on to the individual scoring methods. ~~Any~~                                                                                                                                                             |
diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index e74c37e3080..34927ff3e7b 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -74,23 +74,23 @@ your data.
 
 > ```python
 > # Standard import
-> from spacy.lang.xx import MultiLanguage
+> from spacy.lang.mul import MultiLanguage
 > nlp = MultiLanguage()
 >
 > # With lazy-loading
-> nlp = spacy.blank("xx")
+> nlp = spacy.blank("mul")
 > ```
 
 spaCy also supports pipelines trained on more than one language. This is
 especially useful for named entity recognition. The language ID used for
-multi-language or language-neutral pipelines is `xx`. The language class, a
+multi-language or language-neutral pipelines is `mul`. The language class, a
 generic subclass containing only the base language data, can be found in
-[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).
+[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).
 
 To train a pipeline using the neutral multi-language class, you can set
-`lang = "xx"` in your [training config](/usage/training#config). You can also
+`lang = "mul"` in your [training config](/usage/training#config). You can also
 \import the `MultiLanguage` class directly, or call
-[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
+[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.
 
 ### Chinese language support {id="chinese",version="2.3"}
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index d6a07809795..e520067ba20 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -169,7 +169,7 @@
             "has_examples": true
         },
         {
-            "code": "is",
+            "code": "isl",
             "name": "Icelandic"
         },
         {
@@ -454,9 +454,9 @@
             ]
         },
         {
-            "code": "xx",
+            "code": "mul",
             "name": "Multi-language",
-            "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
+            "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"],
             "example": "This is a sentence about Facebook."
         },
         {
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index b2a0a628018..4994dc22640 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => {
                         </QS>
                         <QS config="example" prompt="python">
                             print([
-                            {code === 'xx'
+                            {code === 'mul'
                                 ? '(ent.text, ent.label) for ent in doc.ents'
                                 : '(w.text, w.pos_) for w in doc'}
                             ])

From f86d9fe4f7aeb21dc82ed32e5781c3a28e9daf7b Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 1 Feb 2023 17:47:56 +0900
Subject: [PATCH 271/504] Move Entity Linker v1 to spacy-legacy (#12006)

* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
---
 spacy/pipeline/entity_linker.py            |  16 +
 spacy/pipeline/legacy/__init__.py          |   3 -
 spacy/pipeline/legacy/entity_linker.py     | 422 ---------------------
 spacy/tests/pipeline/test_entity_linker.py |   3 +-
 4 files changed, 18 insertions(+), 426 deletions(-)
 delete mode 100644 spacy/pipeline/legacy/__init__.py
 delete mode 100644 spacy/pipeline/legacy/entity_linker.py

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 6d041b7de3d..acf1add0599 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -14,6 +14,16 @@
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
 from thinc.types import Floats2d
 
+from ..kb import KnowledgeBase, Candidate
+from ..ml import empty_kb
+from ..tokens import Doc, Span
+from .pipe import deserialize_config
+from .trainable_pipe import TrainablePipe
+from ..language import Language
+from ..vocab import Vocab
+from ..training import Example, validate_examples, validate_get_examples
+from ..errors import Errors
+from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
@@ -127,6 +137,12 @@ def make_entity_linker(
     """
 
     if not model.attrs.get("include_span_maker", False):
+        try:
+            from spacy_legacy.components.entity_linker import EntityLinker_v1
+        except:
+            raise ImportError(
+                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
+            )
         # The only difference in arguments here is that use_gold_ents and threshold aren't available.
         return EntityLinker_v1(
             nlp.vocab,
diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py
deleted file mode 100644
index f216840dc2c..00000000000
--- a/spacy/pipeline/legacy/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .entity_linker import EntityLinker_v1
-
-__all__ = ["EntityLinker_v1"]
diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py
deleted file mode 100644
index 1e46db019d5..00000000000
--- a/spacy/pipeline/legacy/entity_linker.py
+++ /dev/null
@@ -1,422 +0,0 @@
-# This file is present to provide a prior version of the EntityLinker component
-# for backwards compatability. For details see #9669.
-
-import random
-import warnings
-from itertools import islice
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
-
-import srsly
-from thinc.api import CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats2d
-
-from ... import util
-from ...errors import Errors, Warnings
-from ...kb import Candidate, KnowledgeBase
-from ...language import Language
-from ...ml import empty_kb
-from ...scorer import Scorer
-from ...tokens import Doc, Span
-from ...training import Example, validate_examples, validate_get_examples
-from ...util import SimpleFrozenList
-from ...vocab import Vocab
-from ..pipe import deserialize_config
-from ..trainable_pipe import TrainablePipe
-
-# See #9050
-BACKWARD_OVERWRITE = True
-
-
-def entity_linker_score(examples, **kwargs):
-    return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
-
-
-class EntityLinker_v1(TrainablePipe):
-    """Pipeline component for named entity linking.
-
-    DOCS: https://spacy.io/api/entitylinker
-    """
-
-    NIL = "NIL"  # string used to refer to a non-existing link
-
-    def __init__(
-        self,
-        vocab: Vocab,
-        model: Model,
-        name: str = "entity_linker",
-        *,
-        labels_discard: Iterable[str],
-        n_sents: int,
-        incl_prior: bool,
-        incl_context: bool,
-        entity_vector_length: int,
-        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
-        overwrite: bool = BACKWARD_OVERWRITE,
-        scorer: Optional[Callable] = entity_linker_score,
-    ) -> None:
-        """Initialize an entity linker.
-
-        vocab (Vocab): The shared vocabulary.
-        model (thinc.api.Model): The Thinc Model powering the pipeline component.
-        name (str): The component instance name, used to add entries to the
-            losses during training.
-        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
-        n_sents (int): The number of neighbouring sentences to take into account.
-        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
-        incl_context (bool): Whether or not to include the local context in the model.
-        entity_vector_length (int): Size of encoding vectors in the KB.
-        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
-            produces a list of candidates, given a certain knowledge base and a textual mention.
-        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
-        DOCS: https://spacy.io/api/entitylinker#init
-        """
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        self.labels_discard = list(labels_discard)
-        self.n_sents = n_sents
-        self.incl_prior = incl_prior
-        self.incl_context = incl_context
-        self.get_candidates = get_candidates
-        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
-        self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
-        self.scorer = scorer
-
-    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
-        """Define the KB of this pipe by providing a function that will
-        create it using this object's vocab."""
-        if not callable(kb_loader):
-            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
-
-        self.kb = kb_loader(self.vocab)
-
-    def validate_kb(self) -> None:
-        # Raise an error if the knowledge base is not initialized.
-        if self.kb is None:
-            raise ValueError(Errors.E1018.format(name=self.name))
-        if len(self.kb) == 0:
-            raise ValueError(Errors.E139.format(name=self.name))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
-    ):
-        """Initialize the pipe for training, using a representative set
-        of data examples.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
-            Note that providing this argument, will overwrite all data accumulated in the current KB.
-            Use this only when loading a KB as-such from file.
-
-        DOCS: https://spacy.io/api/entitylinker#initialize
-        """
-        validate_get_examples(get_examples, "EntityLinker_v1.initialize")
-        if kb_loader is not None:
-            self.set_kb(kb_loader)
-        self.validate_kb()
-        nO = self.kb.entity_vector_length
-        doc_sample = []
-        vector_sample = []
-        for example in islice(get_examples(), 10):
-            doc_sample.append(example.x)
-            vector_sample.append(self.model.ops.alloc1f(nO))
-        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(
-            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
-        )
-
-    def update(
-        self,
-        examples: Iterable[Example],
-        *,
-        drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
-        losses: Optional[Dict[str, float]] = None,
-    ) -> Dict[str, float]:
-        """Learn from a batch of documents and gold-standard information,
-        updating the pipe's model. Delegates to predict and get_loss.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://spacy.io/api/entitylinker#update
-        """
-        self.validate_kb()
-        if losses is None:
-            losses = {}
-        losses.setdefault(self.name, 0.0)
-        if not examples:
-            return losses
-        validate_examples(examples, "EntityLinker_v1.update")
-        sentence_docs = []
-        for eg in examples:
-            sentences = [s for s in eg.reference.sents]
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                # KB ID of the first token is the same as the whole span
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    try:
-                        # find the sentence in the list of sentences.
-                        sent_index = sentences.index(ent.sent)
-                    except AttributeError:
-                        # Catch the exception when ent.sent is None and provide a user-friendly warning
-                        raise RuntimeError(Errors.E030) from None
-                    # get n previous sentences, if there are any
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    # get n posterior sentences, or as many < n as there are
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    # get token positions
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    # append that span as a doc to training
-                    sent_doc = eg.predicted[start_token:end_token].as_doc()
-                    sentence_docs.append(sent_doc)
-        set_dropout_rate(self.model, drop)
-        if not sentence_docs:
-            warnings.warn(Warnings.W093.format(name="Entity Linker"))
-            return losses
-        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
-        loss, d_scores = self.get_loss(
-            sentence_encodings=sentence_encodings, examples=examples
-        )
-        bp_context(d_scores)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss
-        return losses
-
-    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
-        validate_examples(examples, "EntityLinker_v1.get_loss")
-        entity_encodings = []
-        for eg in examples:
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    entity_encoding = self.kb.get_vector(kb_id)
-                    entity_encodings.append(entity_encoding)
-        entity_encodings = self.model.ops.asarray2f(entity_encodings)
-        if sentence_encodings.shape != entity_encodings.shape:
-            err = Errors.E147.format(
-                method="get_loss", msg="gold entities do not match up"
-            )
-            raise RuntimeError(err)
-        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
-        loss = self.distance.get_loss(sentence_encodings, entity_encodings)
-        loss = loss / len(entity_encodings)
-        return float(loss), gradients
-
-    def predict(self, docs: Iterable[Doc]) -> List[str]:
-        """Apply the pipeline's model to a batch of docs, without modifying them.
-        Returns the KB IDs for each entity in each doc, including NIL if there is
-        no prediction.
-
-        docs (Iterable[Doc]): The documents to predict.
-        RETURNS (List[str]): The models prediction for each document.
-
-        DOCS: https://spacy.io/api/entitylinker#predict
-        """
-        self.validate_kb()
-        entity_count = 0
-        final_kb_ids: List[str] = []
-        if not docs:
-            return final_kb_ids
-        if isinstance(docs, Doc):
-            docs = [docs]
-        for i, doc in enumerate(docs):
-            sentences = [s for s in doc.sents]
-            if len(doc) > 0:
-                # Looping through each entity (TODO: rewrite)
-                for ent in doc.ents:
-                    sent = ent.sent
-                    sent_index = sentences.index(sent)
-                    assert sent_index >= 0
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    xp = self.model.ops.xp
-                    if self.incl_context:
-                        sentence_encoding = self.model.predict([sent_doc])[0]
-                        sentence_encoding_t = sentence_encoding.T
-                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                    entity_count += 1
-                    if ent.label_ in self.labels_discard:
-                        # ignoring this entity - setting to NIL
-                        final_kb_ids.append(self.NIL)
-                    else:
-                        candidates = list(self.get_candidates(self.kb, ent))
-                        if not candidates:
-                            # no prediction possible for this entity - setting to NIL
-                            final_kb_ids.append(self.NIL)
-                        elif len(candidates) == 1:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
-                                    )
-                                # cosine similarity
-                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                    sentence_norm * entity_norm
-                                )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            best_index = scores.argmax().item()
-                            best_candidate = candidates[best_index]
-                            final_kb_ids.append(best_candidate.entity_)
-        if not (len(final_kb_ids) == entity_count):
-            err = Errors.E147.format(
-                method="predict", msg="result variables not of equal length"
-            )
-            raise RuntimeError(err)
-        return final_kb_ids
-
-    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
-        """Modify a batch of documents, using pre-computed scores.
-
-        docs (Iterable[Doc]): The documents to modify.
-        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
-
-        DOCS: https://spacy.io/api/entitylinker#set_annotations
-        """
-        count_ents = len([ent for doc in docs for ent in doc.ents])
-        if count_ents != len(kb_ids):
-            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
-        i = 0
-        overwrite = self.cfg["overwrite"]
-        for doc in docs:
-            for ent in doc.ents:
-                kb_id = kb_ids[i]
-                i += 1
-                for token in ent:
-                    if token.ent_kb_id == 0 or overwrite:
-                        token.ent_kb_id_ = kb_id
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://spacy.io/api/entitylinker#to_bytes
-        """
-        self._validate_serialization_attrs()
-        serialize = {}
-        if hasattr(self, "cfg") and self.cfg is not None:
-            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
-        serialize["kb"] = self.kb.to_bytes
-        serialize["model"] = self.model.to_bytes
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (TrainablePipe): The loaded object.
-
-        DOCS: https://spacy.io/api/entitylinker#from_bytes
-        """
-        self._validate_serialization_attrs()
-
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        if hasattr(self, "cfg") and self.cfg is not None:
-            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
-        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
-        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
-        deserialize["model"] = load_model
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://spacy.io/api/entitylinker#to_disk
-        """
-        serialize = {}
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
-        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["kb"] = lambda p: self.kb.to_disk(p)
-        serialize["model"] = lambda p: self.model.to_disk(p)
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityLinker_v1":
-        """Load the pipe from disk. Modifies the object in place and returns it.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (EntityLinker): The modified EntityLinker object.
-
-        DOCS: https://spacy.io/api/entitylinker#from_disk
-        """
-
-        def load_model(p):
-            try:
-                with p.open("rb") as infile:
-                    self.model.from_bytes(infile.read())
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize: Dict[str, Callable[[Any], Any]] = {}
-        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
-        deserialize["kb"] = lambda p: self.kb.from_disk(p)
-        deserialize["model"] = load_model
-        util.from_disk(path, deserialize, exclude)
-        return self
-
-    def rehearse(self, examples, *, sgd=None, losses=None, **config):
-        raise NotImplementedError
-
-    def add_label(self, label):
-        raise NotImplementedError
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 55726e401d3..8c1759e166e 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -11,7 +11,6 @@
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.pipeline import EntityLinker, TrainablePipe
-from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tests.util import make_tempdir
@@ -1090,6 +1089,8 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
+    from spacy_legacy.components.entity_linker import EntityLinker_v1
+
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From fdec654e90c6bce59143c85d720134bd87d83e25 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 2 Feb 2023 22:13:38 +0900
Subject: [PATCH 272/504] Cleanup/remove backwards compat overwrite settings
 (#11888)

* Remove backwards-compatible overwrite from Entity Linker

This also adds a docstring about overwrite, since it wasn't present.

* Fix docstring

* Remove backward compat settings in Morphologizer

This also needed a docstring added.

For this component it's less clear what the right overwrite settings
are.

* Remove backward compat from sentencizer

This was simple

* Remove backward compat from senter

Another simple one

* Remove backward compat setting from tagger

* Add docstrings

* Update spacy/pipeline/morphologizer.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update docs

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/pipeline/entity_linker.py    | 11 +++--------
 spacy/pipeline/morphologizer.pyx   | 11 ++++-------
 spacy/pipeline/sentencizer.pyx     |  7 ++-----
 spacy/pipeline/senter.pyx          |  5 ++---
 spacy/pipeline/tagger.pyx          |  6 ++----
 website/docs/api/entitylinker.mdx  |  2 +-
 website/docs/api/morphologizer.mdx |  2 +-
 7 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index acf1add0599..5fef27a44c5 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -42,9 +42,6 @@
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
 
-# See #9050
-BACKWARD_OVERWRITE = True
-
 default_model_config = """
 [model]
 @architectures = "spacy.EntityLinker.v2"
@@ -75,8 +72,7 @@
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
-        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
-        "overwrite": True,
+        "overwrite": False,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "candidates_batch_size": 1,
@@ -210,8 +206,7 @@ def __init__(
         get_candidates_batch: Callable[
             [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
         ],
-        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
-        overwrite: bool = BACKWARD_OVERWRITE,
+        overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
         candidates_batch_size: int,
@@ -235,7 +230,7 @@ def __init__(
             Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
-        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
+        overwrite (bool): Whether to overwrite existing non-empty annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
             component must provide entity annotations.
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d3068bdffdd..5e7d0720a40 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -28,10 +28,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = True
-BACKWARD_EXTEND = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -113,9 +109,8 @@ class Morphologizer(Tagger):
         model: Model,
         name: str = "morphologizer",
         *,
-        overwrite: bool = BACKWARD_OVERWRITE,
-        extend: bool = BACKWARD_EXTEND,
-        label_smoothing: float = 0.0,
+        overwrite: bool = False,
+        extend: bool = False,
         scorer: Optional[Callable] = morphologizer_score,
         save_activations: bool = False,
     ):
@@ -125,6 +120,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
+        extend (bool): Whether to extend existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 08ba9d989c1..02b92e87812 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -10,10 +10,6 @@ from ..language import Language
 from .pipe import Pipe
 from .senter import senter_score
 
-# see #9050
-BACKWARD_OVERWRITE = False
-
-
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
@@ -55,13 +51,14 @@ class Sentencizer(Pipe):
         name="sentencizer",
         *,
         punct_chars=None,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=senter_score,
     ):
         """Initialize the sentencizer.
 
         punct_chars (list): Punctuation characters to split on. Will be
             serialized with the nlp object.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
 
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 185430c122c..ba45df28400 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -20,8 +20,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = False
 
 default_model_config = """
 [model]
@@ -85,7 +83,7 @@ class SentenceRecognizer(Tagger):
         model,
         name="senter",
         *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=senter_score,
         save_activations: bool = False,
     ):
@@ -95,6 +93,7 @@ class SentenceRecognizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
         save_activations (bool): save model activations in Doc when annotating.
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index a8a89332bd4..8740058174a 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -26,9 +26,6 @@ from .trainable_pipe import TrainablePipe
 
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
-# See #9050
-BACKWARD_OVERWRITE = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -98,7 +95,7 @@ class Tagger(TrainablePipe):
         model,
         name="tagger",
         *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
         scorer=tagger_score,
         neg_prefix="!",
         save_activations: bool = False,
@@ -109,6 +106,7 @@ class Tagger(TrainablePipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
+        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
         save_activations (bool): save model activations in Doc when annotating.
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 238b62a2e6d..12b2f6bef1d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
 | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
 | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
 | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
 | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4660ec312fa..9514bc773b9 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |

From efce0cfb61516050c2bd1cf67e8038b15b578192 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 3 Feb 2023 15:22:25 +0100
Subject: [PATCH 273/504] `Language.update`: ensure that tok2vec gets updated
 (#12136)

* `Language.update`: ensure that tok2vec gets updated

The components in a pipeline can be updated independently. However,
tok2vec implementations are an exception to this, since they depend on
listeners for their gradients. The update method of a tok2vec
implementation computes the tok2vec forward and passes this along with a
backprop function to the listeners. This backprop function accumulates
gradients for all the listeners. There are two ways in which the
accumulated gradients can be used to update the tok2vec weights:

1. Call the `finish_update` method of tok2vec *after* the `update`
   method is called on all of the pipes that use a tok2vec listener.
2. Pass an optimizer to the `update` method of tok2vec. In this
   case, tok2vec will give the last listener a special backprop
   function that calls `finish_update` on the tok2vec.

Unfortunately, `Language.update` did neither of these. Instead, it
immediately called `finish_update` on every pipe after `update`. As a
result, the tok2vec weights are updated when no gradients have been
accumulated from listeners yet. And the gradients of the listeners are
only used in the next call to `Language.update` (when `finish_update` is
called on tok2vec again).

This change fixes this issue by passing the optimizer to the `update`
method of trainable pipes, leading to use of the second strategy
outlined above.

The main updating loop in `Language.update` is also simplified by using
the `TrainableComponent` protocol consistently.

* Train loop: `sgd` is `Optional[Optimizer]`, do not pass false

* Language.update: call pipe finish_update after all pipe updates

This does correct and fast updates if multiple components update the
same parameters.

* Add comment why we moved `finish_update` to a separate loop
---
 spacy/language.py                             | 28 ++++---
 .../pipeline/test_annotates_on_update.py      | 12 ++-
 spacy/tests/test_language.py                  | 73 ++++++++++++++++++-
 spacy/training/loop.py                        |  2 +-
 4 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index cb9652e97bf..51189ab371a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1280,17 +1280,12 @@ def update(
             component_cfg[name].setdefault("drop", drop)
             pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            # ignore statements are used here because mypy ignores hasattr
-            if name not in exclude and hasattr(proc, "update"):
-                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
-            if sgd not in (None, False):
-                if (
-                    name not in exclude
-                    and isinstance(proc, ty.TrainableComponent)
-                    and proc.is_trainable
-                    and proc.model not in (True, False, None)
-                ):
-                    proc.finish_update(sgd)
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
             if name in annotates:
                 for doc, eg in zip(
                     _pipe(
@@ -1303,6 +1298,17 @@ def update(
                     examples,
                 ):
                     eg.predicted = doc
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for name, proc in self.pipeline:
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.finish_update(sgd)
+
         return losses
 
     def rehearse(
diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py
index d4feebd3045..f13a0ae5a3c 100644
--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@@ -55,9 +55,11 @@ def assert_sents(nlp, name):
         return AssertSents(name)
 
     class AssertSents:
+        model = None
+        is_trainable = True
+
         def __init__(self, name, **cfg):
             self.name = name
-            pass
 
         def __call__(self, doc):
             if not doc.has_annotation("SENT_START"):
@@ -65,10 +67,16 @@ def __call__(self, doc):
             return doc
 
         def update(self, examples, *, drop=0.0, sgd=None, losses=None):
+            losses.setdefault(self.name, 0.0)
+
             for example in examples:
                 if not example.predicted.has_annotation("SENT_START"):
                     raise ValueError("No sents")
-            return {}
+
+            return losses
+
+        def finish_update(self, sgd=None):
+            pass
 
     nlp = English()
     nlp.add_pipe("sentencizer")
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index b419d77b51d..88ef3d434c0 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -13,8 +13,12 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.util import find_matching_language, ignore_error, raise_error, registry
-from spacy.vocab import Vocab
+from spacy.lang.en import English
+from spacy.lang.de import German
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
+from spacy.util import load_model_from_config
+import spacy
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
@@ -27,6 +31,51 @@
 except ImportError:
     pass
 
+TAGGER_CFG_STRING = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
+
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v2"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
+
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v2"
+
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v1"
+    width = ${components.tok2vec.model.encode.width}
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
+
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v2"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
+
+
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
 
 TAGGER_TRAIN_DATA = [
     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
@@ -93,6 +142,26 @@ def test_language_update(nlp):
         example = Example.from_dict(doc, wrongkeyannots)
 
 
+def test_language_update_updates():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=optimizer)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    assert xp.any(
+        xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+    )
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 58d5b06786f..e6b3451cd73 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -221,7 +221,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=False,  # type: ignore[arg-type]
+                sgd=None,
                 exclude=exclude,
                 annotates=annotating_components,
             )

From 96620ff50a0d38909913000be9230aae5b1a2d73 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:28:34 +0100
Subject: [PATCH 274/504] Use the same tuple in Span cmp and hash (#12251)

---
 spacy/tokens/span.pyx | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c4e4c3e5d39..a22fdccad36 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -133,9 +133,8 @@ cdef class Span:
             else:
                 return True
 
-        cdef SpanC* span_c = self.span_c()
-        cdef SpanC* other_span_c = other.span_c()
-
+        self_tuple = self._cmp_tuple()
+        other_tuple = other._cmp_tuple()
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
@@ -170,8 +169,20 @@ cdef class Span:
             return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
+        return hash(self._cmp_tuple())
+
+    def _cmp_tuple(self):
         cdef SpanC* span_c = self.span_c()
-        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
+        return (
+            span_c.start_char,
+            span_c.end_char,
+            span_c.start,
+            span_c.end,
+            span_c.label,
+            span_c.kb_id,
+            span_c.id,
+            self.doc,
+        )
 
     def __len__(self):
         """Get the number of tokens in the span.

From c6098a3dd238863d430b53e3be8842405b11ad4e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:37:42 +0100
Subject: [PATCH 275/504] Remove names for vectors (#12243)

* Remove names for vectors

Named vectors are basically a carry-over from v2 and aren't used for
anything.

* Format
---
 spacy/cli/init_pipeline.py                    |  2 --
 spacy/language.py                             | 14 +----------
 .../serialize/test_serialize_pipeline.py      |  2 +-
 spacy/tests/vocab_vectors/test_vectors.py     | 13 +++++-----
 spacy/training/initialize.py                  |  7 ------
 spacy/vectors.pyx                             |  5 +---
 spacy/vocab.pyi                               |  4 ++--
 spacy/vocab.pyx                               | 24 ++++++-------------
 website/docs/api/cli.mdx                      |  6 ++---
 website/docs/api/vectors.mdx                  |  1 -
 website/docs/api/vocab.mdx                    |  1 -
 11 files changed, 20 insertions(+), 59 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 0ff39d2145b..1a044dedbc9 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -21,7 +21,6 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
-    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
@@ -45,7 +44,6 @@ def init_vectors_cli(
         vectors_loc,
         truncate=truncate,
         prune=prune,
-        name=name,
         mode=mode,
         attr=attr,
     )
diff --git a/spacy/language.py b/spacy/language.py
index 51189ab371a..e8a7d719ef2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -194,12 +194,7 @@ def __init__(
         if not isinstance(vocab, Vocab) and vocab is not True:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
-            vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
-            if not create_vectors:
-                vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
-                create_vectors = registry.resolve(vectors_cfg)["vectors"]
-            vocab.vectors = create_vectors(vocab)
+            vocab = create_vocab(self.lang, self.Defaults)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -253,7 +248,6 @@ def meta(self) -> Dict[str, Any]:
             "width": self.vocab.vectors_length,
             "vectors": len(self.vocab.vectors),
             "keys": self.vocab.vectors.n_keys,
-            "name": self.vocab.vectors.name,
             "mode": self.vocab.vectors.mode,
         }
         self._meta["labels"] = dict(self.pipe_labels)
@@ -2275,9 +2269,6 @@ def deserialize_meta(path: Path) -> None:
             if path.exists():
                 data = srsly.read_json(path)
                 self.meta.update(data)
-                # self.meta always overrides meta["vectors"] with the metadata
-                # from self.vocab.vectors, so set the name directly
-                self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         def deserialize_vocab(path: Path) -> None:
             if path.exists():
@@ -2346,9 +2337,6 @@ def from_bytes(
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
-            # self.meta always overrides meta["vectors"] with the metadata
-            # from self.vocab.vectors, so set the name directly
-            self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         deserializers: Dict[str, Callable[[bytes], Any]] = {}
         deserializers["config.cfg"] = lambda b: self.config.from_bytes(
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 8170488f758..39fbbf58217 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -181,7 +181,7 @@ def test_issue4042_bug2():
 @pytest.mark.issue(4725)
 def test_issue4725_1():
     """Ensure the pickling of the NER goes well"""
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     nlp = English(vocab=vocab)
     config = {
         "update_with_oracle_cut_size": 111,
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 7172913141c..16574656bfb 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -84,7 +84,7 @@ def test_issue1539():
 @pytest.mark.issue(1807)
 def test_issue1807():
     """Test vocab.set_vector also adds the word to the vocab."""
-    vocab = Vocab(vectors_name="test_issue1807")
+    vocab = Vocab()
     assert "hello" not in vocab
     vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
     assert "hello" in vocab
@@ -94,13 +94,12 @@ def test_issue1807():
 def test_issue2871():
     """Test that vectors recover the correct key for spaCy reserved words."""
     words = ["dog", "cat", "SUFFIX"]
-    vocab = Vocab(vectors_name="test_issue2871")
+    vocab = Vocab()
     vocab.vectors.resize(shape=(3, 10))
     vector_data = numpy.zeros((3, 10), dtype="f")
     for word in words:
         _ = vocab[word]  # noqa: F841
         vocab.set_vector(word, vector_data[0])
-    vocab.vectors.name = "dummy_vectors"
     assert vocab["dog"].rank == 0
     assert vocab["cat"].rank == 1
     assert vocab["SUFFIX"].rank == 2
@@ -125,7 +124,7 @@ def test_issue4725_2():
         # ensures that this runs correctly and doesn't hang or crash because of the global vectors
         # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
         # or because of issues with pickling the NER (cf test_issue4725_1)
-        vocab = Vocab(vectors_name="test_vocab_add_vector")
+        vocab = Vocab()
         data = numpy.ndarray((5, 3), dtype="f")
         data[0] = 1.0
         data[1] = 2.0
@@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
 
 
 def test_vocab_add_vector():
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
@@ -356,7 +355,7 @@ def test_vocab_add_vector():
 
 
 def test_vocab_prune_vectors():
-    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
+    vocab = Vocab()
     _ = vocab["cat"]  # noqa: F841
     _ = vocab["dog"]  # noqa: F841
     _ = vocab["kitten"]  # noqa: F841
@@ -406,7 +405,7 @@ def test_vectors_serialize():
 
 
 def test_vector_is_oov():
-    vocab = Vocab(vectors_name="test_vocab_is_oov")
+    vocab = Vocab()
     data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 0621702214c..191821e786e 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -214,7 +214,6 @@ def convert_vectors(
     *,
     truncate: int,
     prune: int,
-    name: Optional[str] = None,
     mode: str = VectorsMode.default,
     attr: str = "ORTH",
 ) -> None:
@@ -262,12 +261,6 @@ def convert_vectors(
                     attr=attr,
                 )
                 nlp.vocab.deduplicate_vectors()
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1 and mode != VectorsMode.floret:
         nlp.vocab.prune_vectors(prune)
 
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 6ff99bb59eb..e16efd2738d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -124,7 +124,6 @@ cdef class Vectors(BaseVectors):
     DOCS: https://spacy.io/api/vectors
     """
     cdef public object strings
-    cdef public object name
     cdef readonly object mode
     cdef public object data
     cdef public object key2row
@@ -137,14 +136,13 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
         """Create a new vector store.
 
         strings (StringStore): The string store.
         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray or cupy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
-        name (str): A name to identify the vectors table.
         mode (str): Vectors mode: "default" or "floret" (default: "default").
         minn (int): The floret char ngram minn (default: 0).
         maxn (int): The floret char ngram maxn (default: 0).
@@ -160,7 +158,6 @@ cdef class Vectors(BaseVectors):
         self.strings = strings
         if self.strings is None:
             self.strings = StringStore()
-        self.name = name
         if mode not in Mode.values():
             raise ValueError(
                 Errors.E202.format(
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 7f5f23e7847..7fbb9764f10 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -12,7 +12,8 @@ from .tokens import Doc, Span
 from .vectors import Vectors
 
 def create_vocab(
-    lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
+    lang: Optional[str],
+    defaults: Any,
 ) -> Vocab: ...
 
 class Vocab:
@@ -29,7 +30,6 @@ class Vocab:
         strings: Optional[Union[List[str], StringStore]] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
-        vectors_name: Optional[str] = ...,
         writing_system: Dict[str, Any] = ...,
         get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
     ) -> None: ...
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index dea3a696e10..8fbd1c5d3dc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -20,7 +20,7 @@ from .vectors import Mode as VectorsMode
 from .vectors import Vectors
 
 
-def create_vocab(lang, defaults, vectors_name=None):
+def create_vocab(lang, defaults):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@@ -36,7 +36,6 @@ def create_vocab(lang, defaults, vectors_name=None):
         lex_attr_getters=lex_attrs,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
-        vectors_name=vectors_name,
     )
 
 
@@ -47,17 +46,9 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(
-        self,
-        lex_attr_getters=None,
-        strings=tuple(),
-        lookups=None,
-        oov_prob=-20.,
-        vectors_name=None,
-        writing_system={},  # no-cython-lint
-        get_noun_chunks=None,
-        **deprecated_kwargs
-    ):
+    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
+                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
+                 **deprecated_kwargs):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -66,7 +57,6 @@ cdef class Vocab:
             vice versa.
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
-        vectors_name (str): Optional name to identify the vectors table.
         get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
             A function that yields base noun phrases used for Doc.noun_chunks.
         """
@@ -83,7 +73,7 @@ cdef class Vocab:
                 _ = self[string]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
-        self.vectors = Vectors(strings=self.strings, name=vectors_name)
+        self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
@@ -321,7 +311,7 @@ cdef class Vocab:
             for key, row in self.vectors.key2row.items()
         }
         # replace vectors with deduplicated version
-        self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=data)
         for key, row in key2row.items():
             self.vectors.add(key, row=row)
 
@@ -378,7 +368,7 @@ cdef class Vocab:
         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
-        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
+        self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
         syn_keys = ops.to_numpy(syn_keys)
         remap = {}
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index acc2ce1caa2..3f91e1ff71e 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -225,7 +225,7 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>
 
 ```bash
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
 ```
 
 | Name               | Description                                                                                                                                                                                                                                                         |
@@ -235,9 +235,7 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
 | `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
-| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str \(option)~~                                                                                                                                                |
-| `--attr`, `-a`     | Token attribute to use for vectors, e.g. `LOWER` or `NORM`) Defaults to `ORTH`. ~~str \(option)~~                                                                                                                                                                   |
-| `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
+| `--mode`, `-m`     | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~                                                                                                                                      |
 | `--verbose`, `-V`  | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      |
 | `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
 | **CREATES**        | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |
diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx
index 0e92eb12ba4..39b309e1377 100644
--- a/website/docs/api/vectors.mdx
+++ b/website/docs/api/vectors.mdx
@@ -52,7 +52,6 @@ modified later.
 | `shape`                                   | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
 | `data`                                    | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~                                                                                                                              |
 | `keys`                                    | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~                                                                                                                |
-| `name`                                    | A name to identify the vectors table. ~~str~~                                                                                                                                          |
 | `mode` <Tag variant="new">3.2</Tag>       | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~                                                                         |
 | `minn` <Tag variant="new">3.2</Tag>       | The floret char ngram minn (default: `0`). ~~int~~                                                                                                                                     |
 | `maxn` <Tag variant="new">3.2</Tag>       | The floret char ngram maxn (default: `0`). ~~int~~                                                                                                                                     |
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 57618397da5..36369c78427 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -34,7 +34,6 @@ Create the vocabulary.
 | `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           |
 | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      |
 | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             |
-| `vectors_name`     | A name to identify the vectors table. ~~str~~                                                                                                                           |
 | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          |
 | `get_noun_chunks`  | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
 

From ef41e883935a10c32d4f64b8f7aae1caf777a85a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:46:07 +0100
Subject: [PATCH 276/504] Remove unused Span.char_span(id=) (#12250)

---
 spacy/tokens/span.pyi     | 1 -
 spacy/tokens/span.pyx     | 3 +--
 website/docs/api/span.mdx | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index ae4a6209e7e..373b4ed1afe 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -103,7 +103,6 @@ class Span:
         label: Union[int, str] = ...,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
-        id: Union[int, str] = ...,
         alignment_mode: str = ...,
         span_id: Union[int, str] = ...,
     ) -> Span: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index a22fdccad36..23d5033558a 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -694,7 +694,7 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
         start (int): The index of the first character of the span.
@@ -704,7 +704,6 @@ cdef class Span:
         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
-        id (Union[int, str]): Unused.
         alignment_mode (str): How character indices are aligned to token
             boundaries. Options: "strict" (character indices must be aligned
             with token boundaries), "contract" (span of all tokens completely
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 1774a298ff2..fa5791c405e 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -193,7 +193,6 @@ the character indices don't map to a valid span.
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| `id`                                            | Unused. ~~Union[int, str]~~                                                                                                                                                                                                                                                  |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | `span_id` <Tag variant="new">3.5.1</Tag>        | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
 | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

From af9815755fe70f9ff27e4226882181456af7bd31 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 15 Feb 2023 12:34:33 +0100
Subject: [PATCH 277/504] Make Span.char_span optional args keyword-only
 (#12257)

* Make Span.char_span optional args keyword-only

* Make kb_id and following kw-only

* Format
---
 spacy/tokens/doc.pyi      | 3 ++-
 spacy/tokens/doc.pyx      | 4 ++--
 spacy/tokens/span.pyi     | 1 +
 spacy/tokens/span.pyx     | 6 +++---
 website/docs/api/doc.mdx  | 1 +
 website/docs/api/span.mdx | 5 +++--
 6 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 97c3f69f430..11f8a1c5eb8 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -124,6 +124,7 @@ class Doc:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
@@ -151,7 +152,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2be827b61c7..74889d95743 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -556,7 +556,7 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice
         `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
         created.
@@ -1650,7 +1650,7 @@ cdef class Doc:
         for span_group in doc_json.get("spans", {}):
             spans = []
             for span in doc_json["spans"][span_group]:
-                char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"])
+                char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
                 if char_span is None:
                     raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
                 spans.append(char_span)
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 373b4ed1afe..3c85542bb3d 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -101,6 +101,7 @@ class Span:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 23d5033558a..1f9db825778 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -694,11 +694,11 @@ cdef class Span:
         else:
             return self.doc[root]
 
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
         """Create a `Span` object from the slice `span.text[start : end]`.
 
-        start (int): The index of the first character of the span.
-        end (int): The index of the first character after the span.
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
         label (Union[int, str]): A label to attach to the Span, e.g. for
             named entities.
         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 28757cbc45f..f53e209afc8 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -214,6 +214,7 @@ alignment mode `"strict".
 | `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
 | `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| _keyword-only_                           |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index fa5791c405e..ae7ef7203b6 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -188,9 +188,10 @@ the character indices don't map to a valid span.
 
 | Name                                            | Description                                                                                                                                                                                                                                                                  |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `start_idx`                                     | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end_idx`                                       | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| _keyword-only_                                  |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |

From 6ebde7e95d934d1f54fec6185b718f01f97e4a0b Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 16 Feb 2023 19:08:55 +0900
Subject: [PATCH 278/504] Use tempfile.TemporaryDirectory (#12285)

---
 spacy/util.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 8c402a74ce9..7448da8ded0 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1056,22 +1056,11 @@ def make_tempdir() -> Generator[Path, None, None]:
     its contents at the end of the with block.
     YIELDS (Path): The path of the temp directory.
     """
-    d = Path(tempfile.mkdtemp())
-    yield d
-
-    # On Windows, git clones use read-only files, which cause permission errors
-    # when being deleted. This forcibly fixes permissions.
-    def force_remove(rmfunc, path, ex):
-        os.chmod(path, stat.S_IWRITE)
-        rmfunc(path)
-
     try:
-        if sys.version_info >= (3, 12):
-            shutil.rmtree(str(d), onexc=force_remove)
-        else:
-            shutil.rmtree(str(d), onerror=force_remove)
+        with tempfile.TemporaryDirectory() as td:
+            yield Path(td)
     except PermissionError as e:
-        warnings.warn(Warnings.W091.format(dir=d, msg=e))
+        warnings.warn(Warnings.W091.format(dir=td, msg=e))
 
 
 def is_in_jupyter() -> bool:

From 02930f80059f5ca30b47578dc272a347b122d7f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 21 Feb 2023 15:47:18 +0100
Subject: [PATCH 279/504] Reimplement distillation with oracle cut size
 (#12214)

* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 101 +++++++++++++++++++++------
 spacy/tests/parser/test_model.py     |  61 ++++++++++++++++
 spacy/tests/parser/test_ner.py       |   5 +-
 spacy/tests/parser/test_parse.py     |   5 +-
 5 files changed, 152 insertions(+), 24 deletions(-)
 create mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00bd..9b2114900d3 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 6a50dbacaeb..ef2e3314e85 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -45,6 +45,11 @@ from ..errors import Errors
 from ..training import validate_examples, validate_get_examples
 from ._parser_internals import _beam_utils
 
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
+
 NUMPY_OPS = NumpyOps()
 
 
@@ -262,8 +267,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -274,12 +279,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs,
+            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
+            states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -528,7 +533,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_to_actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -648,7 +653,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch(self, teacher_step_model, docs, max_length):
+    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -657,10 +662,12 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
-            StateClass start_state
             StateClass state
-            Transition action
-        all_states = self.moves.init_batch(docs)
+            TransitionSystem moves = teacher_pipe.moves
+
+        # Start with the same heuristic as in supervised training: exclude
+        # docs that are within the maximum length.
+        all_states = moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -669,18 +676,28 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
+
+        if not to_cut:
+            return states
+
+        # Parse the states that are too long with the teacher's parsing model.
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+            states=[state.copy() for state in to_cut])
+        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+
+        # Step through the teacher's actions and store every state after
+        # each multiple of max_length.
+        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            # Move states forward max_length actions.
-            length = 0
-            while to_cut and length < max_length:
-                teacher_scores = teacher_step_model.predict(to_cut)
-                self.transition_states(to_cut, teacher_scores)
-                # States that are completed do not need further cutting.
-                to_cut = [state for state in to_cut if not state.is_final()]
-                length += 1
-        return states
+            for step_actions in teacher_actions[:max_length]:
+                to_cut = moves.apply_actions(to_cut, step_actions)
+            teacher_actions = teacher_actions[max_length:]
+
+            if len(teacher_actions) < max_length:
+                break
 
+        return states
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
@@ -742,7 +759,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
+def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -763,3 +780,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
+
+def _states_diff_to_actions(
+    before_states: List[StateClass],
+    after_states: List[StateClass]
+) -> List[Ints1d]:
+    """
+    Return for two sets of states the actions to go from the first set of
+    states to the second set of states. The histories of the first set of
+    states must be a prefix of the second set of states.
+    """
+    cdef StateClass before_state, after_state
+    cdef StateC* c_state_before
+    cdef StateC* c_state_after
+
+    assert len(before_states) == len(after_states)
+
+    # Check invariant: before states histories must be prefixes of after states.
+    for before_state, after_state in zip(before_states, after_states):
+        c_state_before = before_state.c
+        c_state_after = after_state.c
+
+        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+            c_state_after.history.begin())
+
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for before_state, after_state in zip(before_states, after_states):
+            c_state_before = before_state.c
+            c_state_after = after_state.c
+            if step < c_state_after.history.size() - c_state_before.history.size():
+                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 00000000000..8c1cf7a9346
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b2c39ae88bc..2c520b7daf6 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index a6e1852514d..4c709932bb1 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -460,7 +460,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -478,6 +480,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From a73839cb85b394010d801c8edfe6b17bb64f6755 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 23 Feb 2023 11:36:50 +0100
Subject: [PATCH 280/504] Remove backoff from .vector to .tensor (#12292)

---
 spacy/tokens/doc.pyx                          | 34 +++++++++----------
 spacy/tokens/span.pyx                         |  2 --
 spacy/tokens/token.pyx                        |  6 +---
 .../docs/usage/101/_vectors-similarity.mdx    | 15 ++++----
 4 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 74889d95743..7d4382f0ace 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -253,8 +253,8 @@ cdef class Doc:
             head in the doc. Defaults to None.
         deps (Optional[List[str]]): A list of unicode strings, of the same
             length as words, to assign as token.dep. Defaults to None.
-        sent_starts (Optional[List[Union[bool, int, None]]]): A list of values, 
-            of the same length as words, to assign as token.is_sent_start. Will 
+        sent_starts (Optional[List[Union[bool, int, None]]]): A list of values,
+            of the same length as words, to assign as token.is_sent_start. Will
             be overridden by heads if heads is provided. Defaults to None.
         ents (Optional[List[str]]): A list of unicode strings, of the same
             length as words, as IOB tags to assign as token.ent_iob and
@@ -691,22 +691,20 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#vector
         """
-        if "vector" in self.user_hooks:
-            return self.user_hooks["vector"](self)
-        if self._vector is not None:
-            return self._vector
-        xp = get_array_module(self.vocab.vectors.data)
-        if not len(self):
-            self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
-            return self._vector
-        elif self.vocab.vectors.size > 0:
-            self._vector = sum(t.vector for t in self) / len(self)
-            return self._vector
-        elif self.tensor.size > 0:
-            self._vector = self.tensor.mean(axis=0)
-            return self._vector
-        else:
-            return xp.zeros((self.vocab.vectors_length,), dtype="float32")
+        def __get__(self):
+            if "vector" in self.user_hooks:
+                return self.user_hooks["vector"](self)
+            if self._vector is not None:
+                return self._vector
+            xp = get_array_module(self.vocab.vectors.data)
+            if not len(self):
+                self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
+                return self._vector
+            elif self.vocab.vectors.size > 0:
+                self._vector = sum(t.vector for t in self) / len(self)
+                return self._vector
+            else:
+                return xp.zeros((self.vocab.vectors_length,), dtype="float32")
 
     @vector.setter
     def vector(self, value):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 1f9db825778..89a785ef23e 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -561,8 +561,6 @@ cdef class Span:
             return self.doc.user_span_hooks["has_vector"](self)
         elif self.vocab.vectors.size > 0:
             return any(token.has_vector for token in self)
-        elif self.doc.tensor.size > 0:
-            return True
         else:
             return False
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 0e192843ae0..a20b1193fab 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -419,9 +419,7 @@ cdef class Token:
         """
         if "has_vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["has_vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return True
-        return self.vocab.has_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
+        return self.vocab.has_vector(self.c.lex.orth)
 
     @property
     def vector(self):
@@ -434,8 +432,6 @@ cdef class Token:
         """
         if "vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["vector"](self)
-        if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
-            return self.doc.tensor[self.i]
         else:
             return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
 
diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx
index 6deab926d25..39ee8e48a43 100644
--- a/website/docs/usage/101/_vectors-similarity.mdx
+++ b/website/docs/usage/101/_vectors-similarity.mdx
@@ -22,17 +22,20 @@ array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
 <Infobox title="Important note" variant="warning">
 
 To make them compact and fast, spaCy's small [pipeline packages](/models) (all
-packages that end in `sm`) **don't ship with word vectors**, and only include
-context-sensitive **tensors**. This means you can still use the `similarity()`
-methods to compare documents, spans and tokens – but the result won't be as
-good, and individual tokens won't have any vectors assigned. So in order to use
-_real_ word vectors, you need to download a larger pipeline package:
+packages that end in `sm`) **don't ship with word vectors**. In order to use
+`similarity()`, you need to download a larger pipeline package that includes
+vectors:
 
 ```diff
 - python -m spacy download en_core_web_sm
-+ python -m spacy download en_core_web_lg
++ python -m spacy download en_core_web_md
 ```
 
+In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
+backing off to context-sensitive tensors from the `tok2vec` component. These
+tensors do not work well for this purpose and this backoff has been removed in
+spaCy v4.
+
 </Infobox>
 
 Pipeline packages that come with built-in word vectors make them available as

From a731864d5c75a58efaaacd4cf3954572ad294e99 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 1 Mar 2023 16:00:02 +0100
Subject: [PATCH 281/504] Return Tuple[Span] for all Doc/Span attrs that
 provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
---
 spacy/tokens/doc.pyi      |  4 ++--
 spacy/tokens/doc.pyx      | 26 ++++++++++++--------------
 spacy/tokens/span.pyi     |  4 +++-
 spacy/tokens/span.pyx     | 28 ++++++++++++++++------------
 website/docs/api/doc.mdx  | 23 +++++++++++------------
 website/docs/api/span.mdx | 33 ++++++++++++++++-----------------
 6 files changed, 60 insertions(+), 58 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 11f8a1c5eb8..2b39d5baa28 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -155,9 +155,9 @@ class Doc:
         default: str = ...,
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
-    def sents(self) -> Iterator[Span]: ...
+    def sents(self) -> Tuple[Span]: ...
     @property
     def lang(self) -> int: ...
     @property
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7d4382f0ace..3880920a8b9 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -750,12 +750,11 @@ cdef class Doc:
         """
         return self.text
 
-    @property
-    def ents(self):
-        """The named entities in the document. Returns a tuple of named entity
+    property ents:
+        """The named entities in the document. Returns a list of named entity
         `Span` objects, if the entity recognizer has been applied.
 
-        RETURNS (tuple): Entities in the document, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
 
         DOCS: https://spacy.io/api/doc#ents
         """
@@ -913,7 +912,7 @@ cdef class Doc:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
 
-        YIELDS (Span): Noun chunks in the document.
+        RETURNS (Tuple[Span]): Noun chunks in the document.
 
         DOCS: https://spacy.io/api/doc#noun_chunks
         """
@@ -922,36 +921,35 @@ cdef class Doc:
 
         # Accumulate the result before beginning to iterate over it. This
         # prevents the tokenization from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenization changing, so it's okay once we have the Span
-        # objects. See Issue #375.
+        # during the iteration.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+        return tuple(spans)
 
     @property
     def sents(self):
         """Iterate over the sentences in the document. Yields sentence `Span`
         objects. Sentence spans have no label.
 
-        YIELDS (Span): Sentences in the document.
+        RETURNS (Tuple[Span]): Sentences in the document.
 
         DOCS: https://spacy.io/api/doc#sents
         """
         if not self.has_annotation("SENT_START"):
             raise ValueError(Errors.E030)
         if "sents" in self.user_hooks:
-            yield from self.user_hooks["sents"](self)
+            return tuple(self.user_hooks["sents"](self))
         else:
             start = 0
+            spans = []
             for i in range(1, self.length):
                 if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
+                    spans.append(Span(self, start, i))
                     start = i
             if start != self.length:
-                yield Span(self, start, self.length)
+                spans.append(Span(self, start, self.length))
+            return tuple(spans)
 
     @property
     def lang(self):
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 3c85542bb3d..2a529593e5f 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -81,6 +81,8 @@ class Span:
     @property
     def ents(self) -> Tuple[Span]: ...
     @property
+    def sents(self) -> Tuple[Span]: ...
+    @property
     def has_vector(self) -> bool: ...
     @property
     def vector(self) -> Floats1d: ...
@@ -93,7 +95,7 @@ class Span:
     @property
     def text_with_ws(self) -> str: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
     def root(self) -> Token: ...
     def char_span(
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 89a785ef23e..c9cef2bcdaa 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -487,20 +487,21 @@ cdef class Span:
         """Obtain the sentences that contain this span. If the given span
         crosses sentence boundaries, return all sentences it is a part of.
 
-        RETURNS (Iterable[Span]): All sentences that the span is a part of.
+        RETURNS (Tuple[Span]): All sentences that the span is a part of.
 
-         DOCS: https://spacy.io/api/span#sents
+        DOCS: https://spacy.io/api/span#sents
         """
         cdef int start
         cdef int i
 
         if "sents" in self.doc.user_span_hooks:
-            yield from self.doc.user_span_hooks["sents"](self)
-        elif "sents" in self.doc.user_hooks:
+            return tuple(self.doc.user_span_hooks["sents"](self))
+        spans = []
+        if "sents" in self.doc.user_hooks:
             for sentence in self.doc.user_hooks["sents"](self.doc):
                 if sentence.end > self.start:
                     if sentence.start < self.end or sentence.start == self.start == self.end:
-                        yield sentence
+                        spans.append(sentence)
                     else:
                         break
         else:
@@ -515,12 +516,13 @@ cdef class Span:
             # Now, find all the sentences in the span
             for i in range(start + 1, self.doc.length):
                 if self.doc.c[i].sent_start == 1:
-                    yield Span(self.doc, start, i)
+                    spans.append(Span(self.doc, start, i))
                     start = i
                     if start >= self.end:
                         break
-                elif i == self.doc.length - 1:
-                    yield Span(self.doc, start, self.doc.length)
+            if start < self.end:
+                spans.append(Span(self.doc, start, self.end))
+        return tuple(spans)
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
@@ -531,7 +533,7 @@ cdef class Span:
         """The named entities that fall completely within the span. Returns
         a tuple of `Span` objects.
 
-        RETURNS (tuple): Entities in the span, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
 
         DOCS: https://spacy.io/api/span#ents
         """
@@ -546,7 +548,7 @@ cdef class Span:
                     ents.append(ent)
                 else:
                     break
-        return ents
+        return tuple(ents)
 
     @property
     def has_vector(self):
@@ -641,13 +643,15 @@ cdef class Span:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
 
-        YIELDS (Span): Noun chunks in the span.
+        RETURNS (Tuple[Span]): Noun chunks in the span.
 
         DOCS: https://spacy.io/api/span#noun_chunks
         """
+        spans = []
         for span in self.doc.noun_chunks:
             if span.start >= self.start and span.end <= self.end:
-                yield span
+                spans.append(span)
+        return tuple(spans)
 
     @property
     def root(self):
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index f53e209afc8..e92c0e833e0 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
 
 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the document. Yields base noun-phrase
-`Span` objects, if the document has been syntactically parsed. A base noun
-phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
-nested within it – so no NP-level coordination, no prepositional phrases, and no
-relative clauses.
+Returns a tuple of the base noun phrases in the doc, if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
 > assert chunks[1].text == "another phrase"
 > ```
 
-| Name       | Description                           |
-| ---------- | ------------------------------------- |
-| **YIELDS** | Noun chunks in the document. ~~Span~~ |
+| Name        | Description                                  |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
 
 ## Doc.sents {id="sents",tag="property",model="sentences"}
 
-Iterate over the sentences in the document. Sentence spans have no label.
+Returns a tuple of the sentences in the document. Sentence spans have no label.
 
 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -697,9 +696,9 @@ will raise an error otherwise.
 > assert [s.root.text for s in sents] == ["is", "'s"]
 > ```
 
-| Name       | Description                         |
-| ---------- | ----------------------------------- |
-| **YIELDS** | Sentences in the document. ~~Span~~ |
+| Name        | Description                                |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
 
 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
 
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index ae7ef7203b6..cd70d8dcead 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
 > assert ents[0].text == "Mr. Best"
 > ```
 
-| Name        | Description                                                       |
-| ----------- | ----------------------------------------------------------------- |
-| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
+| Name        | Description                                                  |
+| ----------- | ------------------------------------------------------------ |
+| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
 
 ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
-objects, if the document has been syntactically parsed. A base noun phrase, or
-"NP chunk", is a noun phrase that does not permit other NPs to be nested within
-it – so no NP-level coordination, no prepositional phrases, and no relative
-clauses.
+Returns a tuple of the base noun phrases in the span if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
 has not been implemeted for the given language, a `NotImplementedError` is
@@ -301,9 +300,9 @@ raised.
 > assert chunks[0].text == "another phrase"
 > ```
 
-| Name       | Description                       |
-| ---------- | --------------------------------- |
-| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+| Name        | Description                              |
+| ----------- | ---------------------------------------- |
+| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
 
 ## Span.as_doc {id="as_doc",tag="method"}
 
@@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
 
 ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
 
-Returns a generator over the sentences the span belongs to. This property is
-only available when [sentence boundaries](/usage/linguistic-features#sbd) have
-been set on the document by the `parser`, `senter`, `sentencizer` or some custom
+Returns a tuple of the sentences the span belongs to. This property is only
+available when [sentence boundaries](/usage/linguistic-features#sbd) have been
+set on the document by the `parser`, `senter`, `sentencizer` or some custom
 function. It will raise an error otherwise.
 
 If the span happens to cross sentence boundaries, all sentences the span
@@ -541,9 +540,9 @@ overlaps with will be returned.
 > assert len(span.sents) == 2
 > ```
 
-| Name        | Description                                                                |
-| ----------- | -------------------------------------------------------------------------- |
-| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
+| Name        | Description                                                   |
+| ----------- | ------------------------------------------------------------- |
+| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
 
 ## Attributes {id="attributes"}
 

From c1f7202c7fa7db1717ae53d584b3f0cd3b70199e Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 282/504] Drop support for EntityLinker_v1. (#12377)

---
 spacy/errors.py                            |  1 +
 spacy/pipeline/entity_linker.py            | 23 ++--------------------
 spacy/tests/pipeline/test_entity_linker.py |  7 +------
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index c8c595395b3..83a1e9ba2c0 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -980,6 +980,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 5fef27a44c5..208c80d3640 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -131,28 +131,9 @@ def make_entity_linker(
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
     """
-
     if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+        raise ValueError(Errors.E4005)
+
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 8c1759e166e..51ec3255a09 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1083,13 +1083,11 @@ def test_scorer_links():
 @pytest.mark.parametrize(
     "name,config",
     [
-        ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
     ],
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
 
     # Ensure that the legacy architectures still work
     vector_length = 3
@@ -1112,10 +1110,7 @@ def create_kb(vocab):
         return mykb
 
     entity_linker = nlp.add_pipe(name, config={"model": config})
-    if config["@architectures"] == "spacy.EntityLinker.v1":
-        assert isinstance(entity_linker, EntityLinker_v1)
-    else:
-        assert isinstance(entity_linker, EntityLinker)
+    assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
 

From 08967d6b3f602eb486d1c18a2d3642662bad4be6 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 9 Mar 2023 09:37:19 +0100
Subject: [PATCH 283/504] `Tok2Vec`: Add `distill` method (#12108)

* `Tok2Vec`: Add `distill` method

* `Tok2Vec`: Refactor `update`

* Add `Tok2Vec.distill` test

* Update `distill` signature to accept `Example`s instead of separate teacher and student docs

* Add docs

* Remove docstring

* Update test

* Remove `update` calls from test

* Update `Tok2Vec.distill` docstring
---
 spacy/pipeline/tok2vec.py            | 125 ++++++++++++++++++++-------
 spacy/tests/pipeline/test_tok2vec.py | 117 +++++++++++++++----------
 website/docs/api/tok2vec.mdx         |  37 ++++++++
 3 files changed, 204 insertions(+), 75 deletions(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 677f5eec16c..f168aee2ec4 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,3 +1,6 @@
+from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
+from thinc.api import Model, set_dropout_rate, Optimizer, Config
+from thinc.types import Floats2d
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
 
@@ -158,39 +161,9 @@ def update(
 
         DOCS: https://spacy.io/api/tok2vec#update
         """
-        if losses is None:
-            losses = {}
         validate_examples(examples, "Tok2Vec.update")
         docs = [eg.predicted for eg in examples]
-        set_dropout_rate(self.model, drop)
-        tokvecs, bp_tokvecs = self.model.begin_update(docs)
-        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-        losses.setdefault(self.name, 0.0)
-
-        def accumulate_gradient(one_d_tokvecs):
-            """Accumulate tok2vec loss and gradient. This is passed as a callback
-            to all but the last listener. Only the last one does the backprop.
-            """
-            nonlocal d_tokvecs
-            for i in range(len(one_d_tokvecs)):
-                d_tokvecs[i] += one_d_tokvecs[i]
-                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
-            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-
-        def backprop(one_d_tokvecs):
-            """Callback to actually do the backprop. Passed to last listener."""
-            accumulate_gradient(one_d_tokvecs)
-            d_docs = bp_tokvecs(d_tokvecs)
-            if sgd is not None:
-                self.finish_update(sgd)
-            return d_docs
-
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners[:-1]:
-            listener.receive(batch_id, tokvecs, accumulate_gradient)
-        if self.listeners:
-            self.listeners[-1].receive(batch_id, tokvecs, backprop)
-        return losses
+        return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
 
     def get_loss(self, examples, scores) -> None:
         pass
@@ -220,6 +193,96 @@ def initialize(
     def add_label(self, label):
         raise NotImplementedError
 
+    def distill(
+        self,
+        teacher_pipe: Optional["TrainablePipe"],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ) -> Dict[str, float]:
+        """Performs an update of the student pipe's model using the
+        student's distillation examples and sets the annotations
+        of the teacher's distillation examples using the teacher pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
+            for prediction.
+        examples (Iterable[Example]): Distillation examples. The reference (teacher)
+            and predicted (student) docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/tok2vec#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        teacher_docs = [eg.reference for eg in examples]
+        student_docs = [eg.predicted for eg in examples]
+        teacher_preds = teacher_pipe.predict(teacher_docs)
+        teacher_pipe.set_annotations(teacher_docs, teacher_preds)
+        return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
+
+    def _update_with_docs(
+        self,
+        docs: Iterable[Doc],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        set_dropout_rate(self.model, drop)
+
+        tokvecs, accumulate_gradient, backprop = self._create_backprops(
+            docs, losses, sgd=sgd
+        )
+        batch_id = Tok2VecListener.get_batch_id(docs)
+        for listener in self.listeners[:-1]:
+            listener.receive(batch_id, tokvecs, accumulate_gradient)
+        if self.listeners:
+            self.listeners[-1].receive(batch_id, tokvecs, backprop)
+        return losses
+
+    def _create_backprops(
+        self,
+        docs: Iterable[Doc],
+        losses: Dict[str, float],
+        *,
+        sgd: Optional[Optimizer] = None,
+    ) -> Tuple[Floats2d, Callable, Callable]:
+        tokvecs, bp_tokvecs = self.model.begin_update(docs)
+        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def accumulate_gradient(one_d_tokvecs):
+            """Accumulate tok2vec loss and gradient. This is passed as a callback
+            to all but the last listener. Only the last one does the backprop.
+            """
+            nonlocal d_tokvecs
+            for i in range(len(one_d_tokvecs)):
+                d_tokvecs[i] += one_d_tokvecs[i]
+                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
+            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def backprop(one_d_tokvecs):
+            """Callback to actually do the backprop. Passed to last listener."""
+            accumulate_gradient(one_d_tokvecs)
+            d_docs = bp_tokvecs(d_tokvecs)
+            if sgd is not None:
+                self.finish_update(sgd)
+            return d_docs
+
+        return tokvecs, accumulate_gradient, backprop
+
 
 class Tok2VecListener(Model):
     """A layer that gets fed its answers from an upstream connection,
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 9648341a106..e557e294112 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -561,55 +561,84 @@ def test_tok2vec_listeners_textcat():
     assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
 
 
-def test_tok2vec_listener_source_link_name():
-    """The component's internal name and the tok2vec listener map correspond
-    to the most recently modified pipeline.
-    """
-    orig_config = Config().from_str(cfg_string_multi)
-    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+cfg_string_distillation = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
 
-    nlp2 = English()
-    nlp2.add_pipe("tok2vec", source=nlp1)
-    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v2"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
 
-    # there is no way to have the component have the right name for both
-    # pipelines, right now the most recently modified pipeline is prioritized
-    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v2"
 
-    # there is no way to have the tok2vec have the right listener map for both
-    # pipelines, right now the most recently modified pipeline is prioritized
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
-    nlp2.add_pipe("ner", name="ner3", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
-    nlp2.remove_pipe("ner3")
-    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
-    nlp2.remove_pipe("tagger2")
-    assert nlp2.get_pipe("tok2vec").listening_components == []
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v2"
+    width = ${components.tok2vec.model.encode.width}
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
 
-    # at this point the tok2vec component corresponds to nlp2
-    assert nlp1.get_pipe("tok2vec").listening_components == []
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v2"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
 
-    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
-    nlp1.add_pipe("sentencizer")
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
 
-    # modifying nlp2 syncs it back to nlp2
-    nlp2.add_pipe("sentencizer")
-    assert nlp1.get_pipe("tok2vec").listening_components == []
+def test_tok2vec_distillation_teacher_annotations():
+    orig_config = Config().from_str(cfg_string_distillation)
+    teacher_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
+    student_nlp = util.load_model_from_config(
+        orig_config, auto_fill=True, validate=True
+    )
 
+    train_examples_teacher = []
+    train_examples_student = []
+    for t in TRAIN_DATA:
+        train_examples_teacher.append(
+            Example.from_dict(teacher_nlp.make_doc(t[0]), t[1])
+        )
+        train_examples_student.append(
+            Example.from_dict(student_nlp.make_doc(t[0]), t[1])
+        )
 
-def test_tok2vec_listener_source_replace_listeners():
-    orig_config = Config().from_str(cfg_string_multi)
-    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
-    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
-    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
-    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
-
-    nlp2 = English()
-    nlp2.add_pipe("tok2vec", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == []
-    nlp2.add_pipe("tagger", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == []
-    nlp2.add_pipe("ner", name="ner2", source=nlp1)
-    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
+    optimizer = teacher_nlp.initialize(lambda: train_examples_teacher)
+    student_nlp.initialize(lambda: train_examples_student)
+
+    # Since Language.distill creates a copy of the examples to use as
+    # its internal teacher/student docs, we'll need to monkey-patch the
+    # tok2vec pipe's distill method.
+    student_tok2vec = student_nlp.get_pipe("tok2vec")
+    student_tok2vec._old_distill = student_tok2vec.distill
+
+    def tok2vec_distill_wrapper(
+        self,
+        teacher_pipe,
+        examples,
+        **kwargs,
+    ):
+        assert all(not eg.reference.tensor.any() for eg in examples)
+        out = self._old_distill(teacher_pipe, examples, **kwargs)
+        assert all(eg.reference.tensor.any() for eg in examples)
+        return out
+
+    student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
+    student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index a1bb1265eae..8b6d2380bae 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
+## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
+
+Performs an update of the student pipe's model using the student's distillation 
+examples and sets the annotations of the teacher's distillation examples using 
+the teacher pipe. 
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its 
+representations from the teacher. However, since downstream pipes that do 
+perform distillation expect the tok2vec annotations to be present on the 
+correct distillation examples, we need to ensure that they are set beforehand.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need have to have gold
+annotations, the teacher could adds its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tok2vec")
+> student_pipe = student.add_pipe("tok2vec")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+
 ## Tok2Vec.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood

From f8844153a987e1eb2930eefe6229a4f9681fbab8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 19 Mar 2023 23:41:20 +0100
Subject: [PATCH 284/504] Clean up Vocab constructor (#12290)

* Clean up Vocab constructor

* Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]`
  * Don't automatically add strings to vocab
* Change default values to `None`
* Remove `**deprecated_kwargs`

* Format
---
 spacy/strings.pyi                             |  2 +-
 spacy/tests/pipeline/test_pipe_methods.py     |  3 ++-
 .../serialize/test_serialize_vocab_strings.py | 27 +++++++++++--------
 spacy/tests/vocab_vectors/test_lexeme.py      |  2 +-
 spacy/vocab.pyi                               |  2 +-
 spacy/vocab.pyx                               | 18 +++++++------
 website/docs/api/vocab.mdx                    |  5 ++--
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 8b7c0d6bd5a..393661f591d 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
 class StringStore:
-    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
     @overload
     def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 9b9786f0458..39611a74278 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -9,6 +9,7 @@
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.strings import StringStore
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
@@ -131,7 +132,7 @@ def test_issue5458():
     # Test that the noun chuncker does not generate overlapping spans
     # fmt: off
     words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
-    vocab = Vocab(strings=words)
+    vocab = Vocab(strings=StringStore(words))
     deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
     pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
     heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index fd80c3d8e4f..f6356ac9e01 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -13,8 +13,11 @@
 
 from ..util import make_tempdir
 
-test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
-test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
+test_strings = [
+    (StringStore(), StringStore()),
+    (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])),
+]
+test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")]
 
 
 @pytest.mark.issue(599)
@@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     vocab2 = Vocab(strings=strings2)
     vocab1_b = vocab1.to_bytes()
     vocab2_b = vocab2.to_bytes()
-    if strings1 == strings2:
+    if strings1.to_bytes() == strings2.to_bytes():
         assert vocab1_b == vocab2_b
     else:
         assert vocab1_b != vocab2_b
@@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
 def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
-    vocab1[strings[0]].norm_ = lex_attr
-    assert vocab1[strings[0]].norm_ == lex_attr
-    assert vocab2[strings[0]].norm_ != lex_attr
+    s = next(iter(vocab1.strings))
+    vocab1[s].norm_ = lex_attr
+    assert vocab1[s].norm_ == lex_attr
+    assert vocab2[s].norm_ != lex_attr
     vocab2 = vocab2.from_bytes(vocab1.to_bytes())
-    assert vocab2[strings[0]].norm_ == lex_attr
+    assert vocab2[s].norm_ == lex_attr
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
@@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
 def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
-    vocab1[strings[0]].norm_ = lex_attr
-    assert vocab1[strings[0]].norm_ == lex_attr
-    assert vocab2[strings[0]].norm_ != lex_attr
+    s = next(iter(vocab1.strings))
+    vocab1[s].norm_ = lex_attr
+    assert vocab1[s].norm_ == lex_attr
+    assert vocab2[s].norm_ != lex_attr
     with make_tempdir() as d:
         file_path = d / "vocab"
         vocab1.to_disk(file_path)
         vocab2 = vocab2.from_disk(file_path)
-    assert vocab2[strings[0]].norm_ == lex_attr
+    assert vocab2[s].norm_ == lex_attr
 
 
 @pytest.mark.parametrize("strings1,strings2", test_strings)
diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py
index 156e3391aa2..dc2c80bcdd0 100644
--- a/spacy/tests/vocab_vectors/test_lexeme.py
+++ b/spacy/tests/vocab_vectors/test_lexeme.py
@@ -18,7 +18,7 @@ def test_issue361(en_vocab, text1, text2):
 
 @pytest.mark.issue(600)
 def test_issue600():
-    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
+    vocab = Vocab()
     doc = Doc(vocab, words=["hello"])
     doc[0].tag_ = "NN"
 
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index 7fbb9764f10..beb7febee63 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -27,7 +27,7 @@ class Vocab:
     def __init__(
         self,
         lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
-        strings: Optional[Union[List[str], StringStore]] = ...,
+        strings: Optional[StringStore] = ...,
         lookups: Optional[Lookups] = ...,
         oov_prob: float = ...,
         writing_system: Dict[str, Any] = ...,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 8fbd1c5d3dc..f317afd8924 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -46,9 +46,8 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
-                 **deprecated_kwargs):
+    def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
+            oov_prob=-20., writing_system=None, get_noun_chunks=None):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -66,16 +65,19 @@ cdef class Vocab:
         self.cfg = {'oov_prob': oov_prob}
         self.mem = Pool()
         self._by_orth = PreshMap()
-        self.strings = StringStore()
         self.length = 0
-        if strings:
-            for string in strings:
-                _ = self[string]
+        if strings is None:
+            self.strings = StringStore()
+        else:
+            self.strings = strings
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings)
         self.vectors = Vectors(strings=self.strings)
         self.lookups = lookups
-        self.writing_system = writing_system
+        if writing_system is None:
+            self.writing_system = {}
+        else:
+            self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
 
     @property
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 36369c78427..88d3939142f 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -24,14 +24,15 @@ Create the vocabulary.
 > #### Example
 >
 > ```python
+> from spacy.strings import StringStore
 > from spacy.vocab import Vocab
-> vocab = Vocab(strings=["hello", "world"])
+> vocab = Vocab(strings=StringStore(["hello", "world"]))
 > ```
 
 | Name               | Description                                                                                                                                                             |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~                                      |
-| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           |
+| `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~                                                                         |
 | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      |
 | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             |
 | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          |

From 1a51cc4ef8e214fa8877061f567c44d573231083 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 00:34:35 +0100
Subject: [PATCH 285/504] Introduce hierarchy for EL `Candidate` objects
 (#12341)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimick best practices

* fix docs

* fix import

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                            |   8 +-
 spacy/kb/__init__.py                       |  10 +-
 spacy/kb/candidate.pxd                     |  22 ++--
 spacy/kb/candidate.pyx                     | 135 +++++++++++----------
 spacy/kb/kb.pyx                            |  23 ++--
 spacy/kb/kb_in_memory.pyx                  |  36 +++---
 spacy/ml/models/entity_linker.py           |  28 +++++
 spacy/pipeline/entity_linker.py            |  61 ++++++++--
 spacy/tests/pipeline/test_entity_linker.py |  48 ++++----
 spacy/tests/serialize/test_serialize_kb.py |  12 +-
 website/docs/api/inmemorylookupkb.mdx      |  40 ++----
 website/docs/api/kb.mdx                    |  51 +++-----
 12 files changed, 263 insertions(+), 211 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 83a1e9ba2c0..42fdc12e029 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "ignoring the duplicate entry.")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses. If "
             "you are constructing a parse tree incrementally by setting "
@@ -214,7 +214,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -981,6 +985,8 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 93a65ab6194..fb21083ddee 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,11 +1,7 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+from .candidate import Candidate, InMemoryCandidate
 
-__all__ = [
-    "Candidate",
-    "KnowledgeBase",
-    "InMemoryLookupKB",
-    "get_candidates",
-    "get_candidates_batch",
-]
+
+__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 80fcbc45940..f21f423e496 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,15 +1,15 @@
 from libcpp.vector cimport vector
-
+from .kb_in_memory cimport InMemoryLookupKB
 from ..typedefs cimport hash_t
-from .kb cimport KnowledgeBase
-
 
-# Object used by the Entity Linker that summarizes one entity-alias candidate
-# combination.
 cdef class Candidate:
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
-    cdef float entity_freq
-    cdef vector[float] entity_vector
-    cdef hash_t alias_hash
-    cdef float prior_prob
+    pass
+
+
+cdef class InMemoryCandidate(Candidate):
+    cdef readonly hash_t _entity_hash
+    cdef readonly hash_t _alias_hash
+    cpdef vector[float] _entity_vector
+    cdef float _prior_prob
+    cdef readonly InMemoryLookupKB _kb
+    cdef float _entity_freq
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index 4369676e23a..bf66ccfae67 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,90 +1,97 @@
 # cython: infer_types=True
 
-from typing import Iterable
-
-from .kb cimport KnowledgeBase
-
-from ..tokens import Span
+from .kb_in_memory cimport InMemoryLookupKB
+from ..errors import Errors
 
 
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or
-    may not be resolved to a specific `entity` from a Knowledge Base. This
-    will be used as input for the entity linking algorithm which will
-    disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned a certain prior probability.
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
+    algorithm which will disambiguate the various candidates to the correct one.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.
 
     DOCS: https://spacy.io/api/kb/#candidate-init
     """
 
-    def __init__(
-        self,
-        KnowledgeBase kb,
-        entity_hash,
-        entity_freq,
-        entity_vector,
-        alias_hash,
-        prior_prob
-    ):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
 
     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError
 
     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError
 
     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
 
-    @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB]): InMemoryLookupKB instance.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq
 
     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
+    def entity_id(self) -> int:
+        return self._entity_hash
 
     @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector
 
     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
-
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob
 
-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate
-    entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]
 
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]
 
-def get_candidates_batch(
-        kb: KnowledgeBase, mentions: Iterable[Span]
-) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries
-    from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
-    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
-    """
-    return kb.get_candidates_batch(mentions)
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index c7db34e166a..24cee30ffc7 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -36,10 +36,10 @@ cdef class KnowledgeBase:
         self, mentions: Iterable[Span]
     ) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines
-        the entity, the original alias, and the prior probability of that
-        alias resolving to that entity.
-        If no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidates are found for a given mention, an empty list is returned.
         mentions (Iterable[Span]): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
@@ -47,10 +47,10 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines
-        the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specific mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidate is found for the given mention, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
@@ -128,3 +128,10 @@ cdef class KnowledgeBase:
                 parent="KnowledgeBase", method="from_disk", name=self.__name__
             )
         )
+
+    @property
+    def supports_prior_probs(self) -> bool:
+        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
+        raise NotImplementedError(
+            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
+        )
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 2b21f246a54..3aab0d73e72 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -22,8 +22,7 @@ from ..util import SimpleFrozenList, ensure_path
 
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
-
-from .candidate import Candidate as Candidate
+from .candidate import InMemoryCandidate
 
 
 cdef class InMemoryLookupKB(KnowledgeBase):
@@ -255,10 +254,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             alias_entry.probs = probs
             self._aliases_table[alias_index] = alias_entry
 
-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        return self.get_alias_candidates(mention.text)  # type: ignore
+    def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
+        return self._get_alias_candidates(mention.text)  # type: ignore
 
-    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
         Return candidate entities for an alias. Each candidate defines the
         entity, the original alias, and the prior probability of that alias
@@ -271,18 +270,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
-                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[
-                              self._entries[entry_index].vector_index
-                          ],
-                          alias_hash=alias_hash,
-                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(
-                    alias_entry.entry_indices, alias_entry.probs
-                )
-                if entry_index != 0]
+        return [
+            InMemoryCandidate(
+                kb=self,
+                entity_hash=self._entries[entry_index].entity_hash,
+                alias_hash=alias_hash,
+                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                prior_prob=prior_prob,
+                entity_freq=self._entries[entry_index].freq
+            )
+            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+            if entry_index != 0
+        ]
 
     def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -316,6 +315,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 
         return 0.0
 
+    def supports_prior_probs(self) -> bool:
+        return True
+
     def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.
         """
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index b7100c00a4b..99522c4617c 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -14,6 +14,12 @@
 )
 from thinc.types import Floats2d
 
+from ...util import registry
+from ...kb import KnowledgeBase, InMemoryLookupKB
+from ...kb import Candidate
+from ...vocab import Vocab
+from ...tokens import Span, Doc
+from ..extract_spans import extract_spans
 from ...errors import Errors
 from ...kb import (
     Candidate,
@@ -132,3 +138,25 @@ def create_candidates_batch() -> Callable[
     [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+    """
+    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mention (Span): Entity mention for which to identify candidates.
+    RETURNS (Iterable[Candidate]): Identified candidates.
+    """
+    return kb.get_candidates(mention)
+
+
+def get_candidates_batch(
+    kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
+    """
+    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+    """
+    return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 208c80d3640..29db232fcee 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,5 +1,5 @@
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
-from typing import cast
+import warnings
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
 from numpy import dtype
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from pathlib import Path
@@ -15,14 +15,13 @@
 from thinc.types import Floats2d
 
 from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
 from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
@@ -250,6 +249,8 @@ def __init__(
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
+        if self.incl_prior and not self.kb.supports_prior_probs:
+            warnings.warn(Warnings.W401)
 
         def _score_with_ents_set(examples: Iterable[Example], **kwargs):
             # Because of how spaCy works, we can't just score immediately, because Language.evaluate
@@ -539,17 +540,51 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                             entity_encodings = xp.asarray(
                                 [c.entity_vector for c in candidates]
                             )
-                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                            if len(entity_encodings) != len(prior_probs):
-                                raise RuntimeError(
-                                    Errors.E147.format(
-                                        method="predict",
-                                        msg="vectors not of equal length",
+                        elif len(candidates) == 1 and self.threshold is None:
+                            # shortcut for efficiency reasons: take the 1 candidate
+                            final_kb_ids.append(candidates[0].entity_id_)
+                            self._add_activations(
+                                doc_scores=doc_scores,
+                                doc_ents=doc_ents,
+                                scores=[1.0],
+                                ents=[candidates[0].entity_id],
+                            )
+                        else:
+                            random.shuffle(candidates)
+                            # set all prior probabilities to 0 if incl_prior=False
+                            if self.incl_prior and self.kb.supports_prior_probs:
+                                prior_probs = xp.asarray([c.prior_prob for c in candidates])  # type: ignore
+                            else:
+                                prior_probs = xp.asarray([0.0 for _ in candidates])
+                            scores = prior_probs
+                            # add in similarity from the context
+                            if self.incl_context:
+                                entity_encodings = xp.asarray(
+                                    [c.entity_vector for c in candidates]
+                                )
+                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                                if len(entity_encodings) != len(prior_probs):
+                                    raise RuntimeError(
+                                        Errors.E147.format(
+                                            method="predict",
+                                            msg="vectors not of equal length",
+                                        )
                                     )
                                 )
-                            # cosine similarity
-                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
-                                sentence_norm * entity_norm
+                                if sims.shape != prior_probs.shape:
+                                    raise ValueError(Errors.E161)
+                                scores = prior_probs + sims - (prior_probs * sims)
+                            final_kb_ids.append(
+                                candidates[scores.argmax().item()].entity_id_
+                                if self.threshold is None
+                                or scores.max() >= self.threshold
+                                else EntityLinker.NIL
+                            )
+                            self._add_activations(
+                                doc_scores=doc_scores,
+                                doc_ents=doc_ents,
+                                scores=scores,
+                                ents=[c.entity_id for c in candidates],
                             )
                             if sims.shape != prior_probs.shape:
                                 raise ValueError(Errors.E161)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 51ec3255a09..583b9d9b6ce 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -7,9 +7,10 @@
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
-from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
+from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker, get_candidates
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
@@ -452,16 +453,17 @@ def test_candidate_generation(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
+    adam_ent_cands = get_candidates(mykb, adam_ent)
     assert len(get_candidates(mykb, douglas_ent)) == 2
-    assert len(get_candidates(mykb, adam_ent)) == 1
+    assert len(adam_ent_cands) == 1
     assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive
     assert len(get_candidates(mykb, shrubbery_ent)) == 0
 
     # test the content of the candidates
-    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
-    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
-    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
-    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
+    assert adam_ent_cands[0].entity_id_ == "Q2"
+    assert adam_ent_cands[0].alias == "adam"
+    assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
+    assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
 
 
 def test_el_pipe_configuration(nlp):
@@ -489,7 +491,7 @@ def create_kb(vocab):
     assert doc[2].ent_kb_id_ == "Q2"
 
     def get_lowercased_candidates(kb, span):
-        return kb.get_alias_candidates(span.text.lower())
+        return kb._get_alias_candidates(span.text.lower())
 
     def get_lowercased_candidates_batch(kb, spans):
         return [get_lowercased_candidates(kb, span) for span in spans]
@@ -548,24 +550,22 @@ def test_vocab_serialization(nlp):
     mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
-    candidates = mykb.get_alias_candidates("adam")
+    candidates = mykb._get_alias_candidates("adam")
     assert len(candidates) == 1
-    assert candidates[0].entity == q2_hash
-    assert candidates[0].entity_ == "Q2"
-    assert candidates[0].alias == adam_hash
-    assert candidates[0].alias_ == "adam"
+    assert candidates[0].entity_id == q2_hash
+    assert candidates[0].entity_id_ == "Q2"
+    assert candidates[0].alias == "adam"
 
     with make_tempdir() as d:
         mykb.to_disk(d / "kb")
         kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         kb_new_vocab.from_disk(d / "kb")
 
-        candidates = kb_new_vocab.get_alias_candidates("adam")
+        candidates = kb_new_vocab._get_alias_candidates("adam")
         assert len(candidates) == 1
-        assert candidates[0].entity == q2_hash
-        assert candidates[0].entity_ == "Q2"
-        assert candidates[0].alias == adam_hash
-        assert candidates[0].alias_ == "adam"
+        assert candidates[0].entity_id == q2_hash
+        assert candidates[0].entity_id_ == "Q2"
+        assert candidates[0].alias == "adam"
 
         assert kb_new_vocab.get_vector("Q2") == [2]
         assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@@ -585,20 +585,20 @@ def test_append_alias(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
-    assert len(mykb.get_alias_candidates("douglas")) == 2
+    assert len(mykb._get_alias_candidates("douglas")) == 2
 
     # append an alias
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
     # test the size of the relevant candidates has been incremented
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3
 
     # append the same alias-entity pair again should not work (will throw a warning)
     with pytest.warns(UserWarning):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
 
     # test the size of the relevant candidates remained unchanged
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3
 
 
 @pytest.mark.filterwarnings("ignore:\\[W036")
@@ -998,11 +998,11 @@ def test_kb_to_bytes():
     assert kb_2.contains_alias("Russ Cochran")
     assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
     assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
-        kb_2.get_alias_candidates("Russ Cochran")
+    assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
+        kb_2._get_alias_candidates("Russ Cochran")
     )
-    assert len(kb_1.get_alias_candidates("Randomness")) == len(
-        kb_2.get_alias_candidates("Randomness")
+    assert len(kb_1._get_alias_candidates("Randomness")) == len(
+        kb_2._get_alias_candidates("Randomness")
     )
 
 
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 99eb8cd8694..b6bad3c46ee 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -66,19 +66,21 @@ def _check_kb(kb):
         assert alias_string not in kb.get_alias_strings()
 
     # check candidates & probabilities
-    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
+    candidates = sorted(
+        kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
+    )
     assert len(candidates) == 2
 
-    assert candidates[0].entity_ == "Q007"
+    assert candidates[0].entity_id_ == "Q007"
     assert 6.999 < candidates[0].entity_freq < 7.01
     assert candidates[0].entity_vector == [0, 0, 7]
-    assert candidates[0].alias_ == "double07"
+    assert candidates[0].alias == "double07"
     assert 0.899 < candidates[0].prior_prob < 0.901
 
-    assert candidates[1].entity_ == "Q17"
+    assert candidates[1].entity_id_ == "Q17"
     assert 1.99 < candidates[1].entity_freq < 2.01
     assert candidates[1].entity_vector == [7, 1, 0]
-    assert candidates[1].alias_ == "double07"
+    assert candidates[1].alias == "double07"
     assert 0.099 < candidates[1].prior_prob < 0.101
 
 
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 15b1d3bf29c..4621d883810 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -10,9 +10,9 @@ version: 3.5
 
 The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
 implements all of its methods. It stores all KB data in-memory and generates
-[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
-entity names. It's highly optimized for both a low memory footprint and speed of
-retrieval.
+[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
+with entity names. It's highly optimized for both a low memory footprint and
+speed of retrieval.
 
 ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
 
@@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
 ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
 
 Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate). Wraps
+of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
 [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
 
 > #### Example
@@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
 > candidates = kb.get_candidates(doc[0:2])
 > ```
 
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `mention`   | The textual mention or alias. ~~Span~~                               |
-| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
+| Name        | Description                                                                          |
+| ----------- | ------------------------------------------------------------------------------------ |
+| `mention`   | The textual mention or alias. ~~Span~~                                               |
+| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
 
 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
 
@@ -194,26 +194,10 @@ to you.
 > candidates = kb.get_candidates((doc[0:2], doc[3:]))
 > ```
 
-| Name        | Description                                                                                  |
-| ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
-| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
-
-## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate).
-
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name        | Description                                                   |
-| ----------- | ------------------------------------------------------------- |
-| `alias`     | The textual mention or alias. ~~str~~                         |
-| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+| Name        | Description                                                                                                  |
+| ----------- | ------------------------------------------------------------------------------------------------------------ |
+| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
 
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 2b0d4d9d6b3..9536a3fe375 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -103,23 +103,6 @@ to you.
 | `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
-## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-<Infobox variant="warning">
-  This method is _not_ available from spaCy 3.5 onwards.
-</Infobox>
-
-From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
-allow more flexibility in customizing knowledge bases. Some of its methods were
-moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
-one of those being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-Note:
-[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
-defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
 
 Given a certain entity ID, retrieve its pretrained entity vector.
@@ -190,25 +173,27 @@ Restore the state of the knowledge base from a given directory. Note that the
 | `exclude`   | List of components to exclude. ~~Iterable[str]~~                                                |
 | **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~                                          |
 
-## Candidate {id="candidate",tag="class"}
+## InMemoryCandidate {id="candidate",tag="class"}
 
-A `Candidate` object refers to a textual mention (alias) that may or may not be
-resolved to a specific entity from a `KnowledgeBase`. This will be used as input
-for the entity linking algorithm which will disambiguate the various candidates
-to the correct one. Each candidate `(alias, entity)` pair is assigned to a
-certain prior probability.
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or
+may not be resolved to a specific entity from a `KnowledgeBase`. This will be
+used as input for the entity linking algorithm which will disambiguate the
+various candidates to the correct one. Each candidate `(alias, entity)` pair is
+assigned to a certain prior probability.
 
-### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
+### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
 
-Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the `get_candidates` method of the
-[`entity_linker`](/api/entitylinker) pipe.
+Construct an `InMemoryCandidate` object. Usually this constructor is not called
+directly, but instead these objects are returned by the `get_candidates` method
+of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example
+> #### Example```python
+>
+> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
+> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+>
+> ```
 >
-> ```python
-> from spacy.kb import Candidate
-> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
 > ```
 
 | Name          | Description                                                               |
@@ -216,10 +201,10 @@ but instead these objects are returned by the `get_candidates` method of the
 | `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~         |
 | `entity_hash` | The hash of the entity's KB ID. ~~int~~                                   |
 | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~                     |
-| `alias_hash`  | The hash of the textual mention or alias. ~~int~~                         |
+| `alias_hash`  | The hash of the entity alias. ~~int~~                                     |
 | `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
 
-## Candidate attributes {id="candidate-attributes"}
+## InMemoryCandidate attributes {id="candidate-attributes"}
 
 | Name            | Description                                                              |
 | --------------- | ------------------------------------------------------------------------ |

From 685ee18632222f7c1034b5ab9f2e9d86aa0a35af Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 286/504] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/kb/__init__.py                       |  1 -
 spacy/kb/kb.pyx                            | 10 ++---
 spacy/ml/models/entity_linker.py           |  8 ++--
 spacy/pipeline/entity_linker.py            | 45 ++++++++++------------
 spacy/tests/pipeline/test_entity_linker.py |  1 -
 website/docs/api/inmemorylookupkb.mdx      |  5 ++-
 website/docs/api/kb.mdx                    | 11 +++---
 7 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index fb21083ddee..7155c15df9a 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -3,5 +3,4 @@
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate
 
-
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index 24cee30ffc7..bb58bf88a46 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,8 +5,8 @@ from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
 
-from ..errors import Errors
-from ..tokens import Span
+from .candidate import Candidate
+from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from .candidate import Candidate
 
@@ -32,15 +32,13 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()
 
-    def get_candidates_batch(
-        self, mentions: Iterable[Span]
-    ) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
         """
         Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
         probability of the specified mention text resolving to that entity - might be included.
         If no candidates are found for a given mention, an empty list is returned.
-        mentions (Iterable[Span]): Mentions for which to get candidates.
+        mentions (SpanGroup): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         return [self.get_candidates(span) for span in mentions]
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 99522c4617c..db960fbd0a9 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -18,7 +18,7 @@
 from ...kb import KnowledgeBase, InMemoryLookupKB
 from ...kb import Candidate
 from ...vocab import Vocab
-from ...tokens import Span, Doc
+from ...tokens import Doc, Span, SpanGroup
 from ..extract_spans import extract_spans
 from ...errors import Errors
 from ...kb import (
@@ -135,7 +135,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
 
 @registry.misc("spacy.CandidateBatchGenerator.v1")
 def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+    [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
 
@@ -151,12 +151,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
 
 
 def get_candidates_batch(
-    kb: KnowledgeBase, mentions: Iterable[Span]
+    kb: KnowledgeBase, mentions: SpanGroup
 ) -> Iterable[Iterable[Candidate]]:
     """
     Return candidate entities for the given mentions and fetching appropriate entries from the index.
     kb (KnowledgeBase): Knowledge base to query.
-    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    mentions (SpanGroup): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
     """
     return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 29db232fcee..a1007abbffe 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,6 +16,8 @@
 
 from ..kb import KnowledgeBase, Candidate
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -96,7 +98,7 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
     ],
     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
@@ -119,7 +121,7 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
         produces a list of candidates, given a certain knowledge base and a textual mention.
     get_candidates_batch (
-        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
@@ -184,7 +186,7 @@ def __init__(
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
         ],
         overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
@@ -207,7 +209,7 @@ def __init__(
         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
         get_candidates_batch (
-            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
         overwrite (bool): Whether to overwrite existing non-empty annotations.
@@ -487,26 +489,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 continue
             sentences = [s for s in doc.sents]
 
-                if self.incl_context:
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    sentence_encoding = self.model.predict([sent_doc])[0]
-                    sentence_encoding_t = sentence_encoding.T
-                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                entity_count += 1
-                if ent.label_ in self.labels_discard:
-                    # ignoring this entity - setting to NIL
-                    final_kb_ids.append(self.NIL)
-                    self._add_activations(
-                        doc_scores=doc_scores,
-                        doc_ents=doc_ents,
-                        scores=[0.0],
-                        ents=[0],
+            # Loop over entities in batches.
+            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
+                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
+
+                # Look up candidate entities.
+                valid_ent_idx = [
+                    idx
+                    for idx in range(len(ent_batch))
+                    if ent_batch[idx].label_ not in self.labels_discard
+                ]
+
+                batch_candidates = list(
+                    self.get_candidates_batch(
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                 else:
                     candidates = list(self.get_candidates(self.kb, ent))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 583b9d9b6ce..7b597424a34 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1088,7 +1088,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 4621d883810..712cce30747 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -189,14 +189,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                                  |
 | ----------- | ------------------------------------------------------------------------------------------------------------ |
-| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                                          |
 | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 9536a3fe375..94506162f27 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -93,14 +93,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                  |
 | ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                          |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
 directly, but instead these objects are returned by the `get_candidates` method
 of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example```python
+> #### Example
 >
+> ```python
 > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
 > entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
->
-> ```
->
 > ```
 
 | Name          | Description                                                               |

From d80d19f4b389f29000cfa718ffbe69a115a28936 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 27 Mar 2023 09:18:23 +0200
Subject: [PATCH 287/504] Add info that Vocab and StringStore are not static in
 docs (#12427)

* Add size increase info about vocab and stringstore

* Update website/docs/api/stringstore.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Update website/docs/api/vocab.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Change wording

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 website/docs/api/stringstore.mdx | 2 +-
 website/docs/api/vocab.mdx       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 269ac2d0c4b..1b1f3bd5352 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -8,7 +8,7 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.
 
-<Infobox variant ="warning">
+<Infobox variant="warning">
 
 Note that a `StringStore` instance is not static. It increases in size as texts
 with new tokens are processed.
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 88d3939142f..319ce88b8dc 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -10,7 +10,7 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.
 
-<Infobox variant ="warning">
+<Infobox variant="warning">
 
 Note that a `Vocab` instance is not static. It increases in size as texts with
 new tokens are processed. Some models may have an empty vocab at initialization.

From 4e7b35614f5819c42b4b9ef4405ce365bd8c46ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 30 Mar 2023 09:30:42 +0200
Subject: [PATCH 288/504] Avoid `TrainablePipe.finish_update` getting called
 twice during training (#12450)

* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradient were accumulated. However, it introduced a new bug that cause
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed.

This change restores proper handling/use of `False`. Moreover, the role
of `False` is now documented and added to the type to avoid future
accidents.

* Fix typo

* Document defaults for `Language.update`
---
 spacy/language.py             |  7 +++++--
 spacy/tests/test_language.py  | 18 ++++++++++++++++++
 spacy/training/loop.py        |  2 +-
 website/docs/api/language.mdx | 18 +++++++++---------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e8a7d719ef2..b8c4322d3b4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1230,7 +1230,7 @@ def update(
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1241,7 +1241,9 @@ def update(
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1300,6 +1302,7 @@ def update(
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 88ef3d434c0..e4b06893c93 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -162,6 +162,24 @@ def test_language_update_updates():
     )
 
 
+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e6b3451cd73..9497b95aba5 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -221,7 +221,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 2a1f7a1a961..e38e49bf569 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 >     nlp.update([example], sgd=optimizer)
 > ```
 
-| Name            | Description                                                                                                                                    |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
-| _keyword-only_  |                                                                                                                                                |
-| `drop`          | The dropout rate. ~~float~~                                                                                                                    |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                  |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
+| Name            | Description                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                                                                                  |
+| _keyword-only_  |                                                                                                                                                                                                    |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~                                                                                                                                                     |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~                                                                                |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.distill {id="distill",tag="method,experimental",version="4"}
 

From d8a8206f3b2e38593ce44a03f3aaf6753c2b0c11 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 Apr 2023 16:01:59 +0200
Subject: [PATCH 289/504] Enforce that Span.start/end(_char) remain valid and
 in sync (#12268)

* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
---
 spacy/errors.py              |  5 +++-
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx        | 49 +++++++++++++++++++++++++++---------
 3 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 42fdc12e029..fe067f7915d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -932,7 +932,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -986,6 +986,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 74874624888..0b05ca7c123 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -696,3 +696,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c9cef2bcdaa..8662481ac78 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -800,36 +800,61 @@ cdef class Span:
             return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0:
-                raise IndexError("TODO")
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx
 
     property end:
         def __get__(self):
             return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0:
-                raise IndexError("TODO")
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError("TODO")
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError("TODO")
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end
 
     property label:
         def __get__(self):

From 2d9b99844525465021022e5a36e66206bf52dfa7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 21 Apr 2023 13:49:40 +0200
Subject: [PATCH 290/504] Add distillation loop (#12542)

* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We apply finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                 |  26 ++-
 spacy/schemas.py                  |   2 +-
 spacy/tests/training/test_loop.py | 111 +++++++++++
 spacy/training/initialize.py      | 134 ++++++++++---
 spacy/training/loop.py            | 317 +++++++++++++++++++++++++++---
 spacy/util.py                     |  20 ++
 website/docs/api/language.mdx     |  26 +--
 7 files changed, 560 insertions(+), 76 deletions(-)
 create mode 100644 spacy/tests/training/test_loop.py

diff --git a/spacy/language.py b/spacy/language.py
index b8c4322d3b4..028f733200e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1052,7 +1052,7 @@ def distill(
         examples: Iterable[Example],
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1065,7 +1065,9 @@ def distill(
             (teacher) and predicted (student) docs must have the same number of
             tokens and the same orthography.
         drop (float): The dropout rate.
-        sgd (Optional[Optimizer]): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Optional(Dict[str, float])): Dictionary to update with the loss,
             keyed by component.
         component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
@@ -1135,11 +1137,23 @@ def distill(
                 student_proc.distill(
                     teacher_pipe,
                     examples,
-                    sgd=sgd,
+                    sgd=None,
                     losses=losses,
                     **component_cfg[student_name],
                 )
 
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for student_name, student_proc in self.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and sgd not in (None, False)
+            ):
+                student_proc.finish_update(sgd)
+
         return losses
 
     def disable_pipes(self, *names) -> "DisabledPipes":
@@ -1908,7 +1922,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
-        orig_distill = config.pop("distill", None)
+        orig_distill = config.pop("distillation", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1918,8 +1932,8 @@ def from_config(
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         if orig_distill is not None:
-            filled["distill"] = orig_distill
-            config["distill"] = orig_distill
+            filled["distillation"] = orig_distill
+            config["distillation"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 32fb042b5a0..7fc5ec20e51 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -510,7 +510,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
-    "distill": ConfigSchemaDistill,
+    "distillation": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/training/test_loop.py b/spacy/tests/training/test_loop.py
new file mode 100644
index 00000000000..46d01509504
--- /dev/null
+++ b/spacy/tests/training/test_loop.py
@@ -0,0 +1,111 @@
+from typing import Callable, Iterable, Iterator
+import pytest
+from spacy import Language
+from spacy.training import Example
+from spacy.training.initialize import init_nlp_student
+from spacy.training.loop import distill, train
+from spacy.util import load_model_from_config, registry
+from thinc.api import Config
+
+
+@pytest.fixture
+def config_str():
+    return """
+    [nlp]
+    lang = "en"
+    pipeline = ["senter"]
+    disabled = []
+    before_creation = null
+    after_creation = null
+    after_pipeline_creation = null
+    batch_size = 1000
+    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+    [components]
+
+    [components.senter]
+    factory = "senter"
+
+    [training]
+    dev_corpus = "corpora.dev"
+    train_corpus = "corpora.train"
+    max_steps = 50
+    seed = 1
+    gpu_allocator = null
+
+    [distillation]
+    corpus = "corpora.train"
+    dropout = 0.1
+    max_epochs = 0
+    max_steps = 50
+    student_to_teacher = {}
+
+    [distillation.batcher]
+    @batchers = "spacy.batch_by_words.v1"
+    size = 3000
+    discard_oversize = false
+    tolerance = 0.2
+
+    [distillation.optimizer]
+    @optimizers = "Adam.v1"
+    beta1 = 0.9
+    beta2 = 0.999
+    L2_is_weight_decay = true
+    L2 = 0.01
+    grad_clip = 1.0
+    use_averages = true
+    eps = 1e-8
+    learn_rate = 1e-4
+
+    [corpora]
+
+    [corpora.dev]
+    @readers = "sentence_corpus"
+
+    [corpora.train]
+    @readers = "sentence_corpus"
+    """
+
+
+SENT_STARTS = [0] * 14
+SENT_STARTS[0] = 1
+SENT_STARTS[5] = 1
+SENT_STARTS[9] = 1
+
+TRAIN_DATA = [
+    (
+        "I like green eggs. Eat blue ham. I like purple eggs.",
+        {"sent_starts": SENT_STARTS},
+    ),
+    (
+        "She likes purple eggs. They hate ham. You like yellow eggs.",
+        {"sent_starts": SENT_STARTS},
+    ),
+]
+
+
+@pytest.mark.slow
+def test_distill_loop(config_str):
+    @registry.readers("sentence_corpus")
+    def create_sentence_corpus() -> Callable[[Language], Iterable[Example]]:
+        return SentenceCorpus()
+
+    class SentenceCorpus:
+        def __call__(self, nlp: Language) -> Iterator[Example]:
+            for t in TRAIN_DATA:
+                yield Example.from_dict(nlp.make_doc(t[0]), t[1])
+
+    orig_config = Config().from_str(config_str)
+    teacher = load_model_from_config(orig_config, auto_fill=True, validate=True)
+    teacher.initialize()
+    train(teacher)
+
+    orig_config = Config().from_str(config_str)
+    student = init_nlp_student(orig_config, teacher)
+    student.initialize()
+    distill(teacher, student)
+
+    doc = student(TRAIN_DATA[0][0])
+    assert doc.sents[0].text == "I like green eggs."
+    assert doc.sents[1].text == "Eat blue ham."
+    assert doc.sents[2].text == "I like purple eggs."
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 191821e786e..61ad1c09cc0 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,3 +1,9 @@
+from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
+from thinc.api import Config, ConfigValidationError
+from pathlib import Path
+import srsly
+import numpy
+import tarfile
 import gzip
 import tarfile
 import warnings
@@ -12,22 +18,11 @@
 from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
 
 from ..errors import Errors, Warnings
-from ..lookups import Lookups
-from ..schemas import ConfigSchemaTraining
-from ..util import (
-    DEFAULT_OOV_PROB,
-    OOV_RANK,
-    ensure_path,
-    get_sourced_components,
-    load_model,
-    load_model_from_config,
-    logger,
-    registry,
-    resolve_dot_names,
-)
-from ..vectors import Mode as VectorsMode
-from ..vectors import Vectors
-from .pretrain import get_tok2vec_ref
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
+from ..util import load_model, ensure_path, get_sourced_components
+from ..util import OOV_RANK, DEFAULT_OOV_PROB
+from ..util import set_gpu_allocator_from_config, set_seed_from_config
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -36,15 +31,8 @@
 def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
-    if "seed" not in config["training"]:
-        raise ValueError(Errors.E1015.format(value="[training] seed"))
-    if "gpu_allocator" not in config["training"]:
-        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
     # Use original config here before it's resolved to functions
     sourced = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
@@ -111,6 +99,102 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     return nlp
 
 
+def init_nlp_student(
+    config: Config, teacher: "Language", *, use_gpu: int = -1
+) -> "Language":
+    """Initialize student pipeline for distillation.
+
+    config (Config): Student model configuration.
+    teacher (Language): The teacher pipeline to distill from.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    """
+    raw_config = config
+    config = raw_config.interpolate()
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
+
+    # Use original config here before it's resolved to functions
+    sourced = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    logger.info("Set up nlp object from config")
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    D = registry.resolve(config["distillation"], schema=ConfigSchemaDistill)
+    dot_names = [T["dev_corpus"]]
+    if not isinstance(D["corpus"], str):
+        raise ConfigValidationError(
+            desc=Errors.E897.format(field="distillation.corpus", type=type(D["corpus"]))
+        )
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(
+            desc=Errors.E897.format(
+                field="training.dev_corpus", type=type(T["dev_corpus"])
+            )
+        )
+    (dev_corpus,) = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    # Make sure that listeners are defined before initializing further
+    nlp._link_components()
+
+    # Get teacher labels to initialize student with.
+    student_to_teacher = D["student_to_teacher"]
+    teacher_pipes = dict(teacher.pipeline)
+    labels = {}
+    for name, pipe in nlp.pipeline:
+        # Copy teacher labels.
+        teacher_pipe_name = (
+            student_to_teacher[name] if name in student_to_teacher else name
+        )
+        teacher_pipe = teacher_pipes.get(teacher_pipe_name, None)
+        if (
+            teacher_pipe is not None
+            and getattr(teacher_pipe, "label_data", None) is not None
+        ):
+            labels[name] = teacher_pipe.label_data  # type: ignore[attr-defined]
+
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        # Initialize on the dev corpus, since the distillation corpus does
+        # usually not have labels. Since we copy the labels from the teacher
+        # pipe, the dev data does not have to be exhaustive.
+        if T["max_epochs"] == -1:
+            sample_size = 100
+            logger.debug(
+                f"Due to streamed train corpus, using only first {sample_size} "
+                f"examples for initialization. If necessary, provide all labels "
+                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+            )
+            nlp.initialize(lambda: islice(dev_corpus(nlp), sample_size), sgd=optimizer)
+        else:
+            nlp.initialize(lambda: dev_corpus(nlp), sgd=optimizer, labels=labels)
+        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    # Detect components with listeners that are not frozen consistently
+    for name, proc in nlp.pipeline:
+        for listener in getattr(
+            proc, "listening_components", []
+        ):  # e.g. tok2vec/transformer
+            # Don't warn about components not in the pipeline
+            if listener not in nlp.pipe_names:
+                continue
+            if listener in frozen_components and name not in frozen_components:
+                logger.warning(Warnings.W087.format(name=name, listener=listener))
+            # We always check this regardless, in case user freezes tok2vec
+            if listener not in frozen_components and name in frozen_components:
+                if name not in T["annotating_components"]:
+                    logger.warning(Warnings.W086.format(name=name, listener=listener))
+    return nlp
+
+
 def init_vocab(
     nlp: "Language",
     *,
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 9497b95aba5..ad162678fec 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,26 +3,20 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
-
-from thinc.api import Config, Optimizer, constant, fix_random_seed, set_gpu_allocator
+from thinc.api import Optimizer, Config, constant
 from wasabi import Printer
+import random
+import sys
+import shutil
+
 
-from ..errors import Errors
-from ..schemas import ConfigSchemaTraining
-from ..util import logger, registry, resolve_dot_names
 from .example import Example
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from ..errors import Errors
+from ..tokens.doc import Doc
+from .. import ty
+from ..util import resolve_dot_names, registry, logger
+from ..util import set_gpu_allocator_from_config, set_seed_from_config
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -32,6 +26,129 @@
 DIR_MODEL_LAST = "model-last"
 
 
+def distill(
+    teacher: "Language",
+    student: "Language",
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    stdout: IO = sys.stdout,
+    stderr: IO = sys.stderr,
+) -> Tuple["Language", Optional[Path]]:
+    """Distill a student pipeline from a teacher pipeline.
+
+    teacher (Language): The teacher pipeline to distill from.
+    student (Language): The student pipeline to distill into.
+    output_path (Optional[Path]): Optional output path to save the student
+        model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    stdout (file): A file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+    stderr (file): A second file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+
+    RETURNS (tuple): The final student nlp object and the path to the exported
+        student model.
+    """
+    # We use no_print here so we can respect the stdout/stderr options.
+    msg = Printer(no_print=True)
+    # Create iterator, which yields out info after each optimization step.
+    config = student.config.interpolate()
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    D = registry.resolve(config["distillation"], schema=ConfigSchemaDistill)
+    dot_names = [D["corpus"], T["dev_corpus"]]
+    distill_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = D["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = D["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    before_update = T["before_update"]
+    student_to_teacher = D["student_to_teacher"]
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with student.use_params(optimizer.averages):
+            before_to_disk(student).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Components that should set annotations on update
+    annotating_components = T["annotating_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = _distill_loop(
+        teacher,
+        student,
+        optimizer,
+        create_distill_batches(student, distill_corpus, batcher, D["max_epochs"]),
+        create_evaluation_callback(student, dev_corpus, score_weights),
+        dropout=D["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        max_steps=D["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+        annotating_components=annotating_components,
+        before_update=before_update,
+        student_to_teacher=student_to_teacher,
+    )
+    clean_output_dir(output_path)
+    stdout.write(msg.info(f"Teacher pipeline: {teacher.pipe_names}") + "\n")
+    stdout.write(msg.info(f"Student pipeline: {student.pipe_names}") + "\n")
+    if frozen_components:
+        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    if annotating_components:
+        stdout.write(
+            msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
+        )
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
+    with student.select_pipes(disable=frozen_components):
+        log_step, finalize_logger = train_logger(student, stdout, stderr)
+    try:
+        for batch, info, is_best_checkpoint in training_step_iterator:
+            if is_best_checkpoint is not None:
+                with student.select_pipes(disable=frozen_components):
+                    update_meta(T, student, info)
+                if output_path is not None:
+                    save_checkpoint(is_best_checkpoint)
+                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
+            log_step(info if is_best_checkpoint is not None else None)
+    except Exception as e:
+        if output_path is not None:
+            stdout.write(
+                msg.warn(
+                    f"Aborting and saving the final best model. "
+                    f"Encountered exception: {repr(e)}"
+                )
+                + "\n"
+            )
+        raise e
+    finally:
+        finalize_logger()
+        if output_path is not None:
+            save_checkpoint(False)
+    # This will only run if we did't hit an error
+    if optimizer.averages:
+        student.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (student, output_path / DIR_MODEL_LAST)
+    else:
+        return (student, None)
+
+
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -57,11 +174,8 @@ def train(
     msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
+    set_seed_from_config(config)
+    set_gpu_allocator_from_config(config, use_gpu)
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
@@ -150,11 +264,131 @@ def save_checkpoint(is_best):
         return (nlp, None)
 
 
+def _distill_loop(
+    teacher: "Language",
+    student: "Language",
+    optimizer: Optimizer,
+    distill_data: Iterable[List[Example]],
+    evaluate: Callable[[], Tuple[float, Dict[str, float]]],
+    *,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    max_steps: int,
+    exclude: List[str],
+    annotating_components: List[str],
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
+    student_to_teacher: Dict[str, str],
+):
+    """Distill until the data is exhausted or the maximum number of steps
+    has been reached. Works as a generator, with each iteration yielding
+    a tuple `(batch, info, is_best_checkpoint)`, where info is a dict, and
+    is_best_checkpoint is in [True, False, None] -- None indicating that
+    the iteration was not evaluated as a checkpoint. The evaluation is
+    conducted by calling the evaluate callback.
+
+    Positional arguments:
+        teacher (Language): The teacher pipeline to distill from.
+        student (Language): The student pipeline to distill into.
+        optimizer: The optimizer callable.
+        distill_data (Iterable[List[Example]]): A generator of batches,
+            with the distillation data. The distillation data iterable
+            needs to take care of iterating over the epochs and shuffling.
+        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+            The callback should take no arguments and return a tuple
+            `(main_score, other_scores)`. The main_score should be a float where
+            higher is better. other_scores can be any object.
+
+    Every iteration, the function yields out a tuple with:
+
+    * batch: A list of Example objects.
+    * info: A dict with various information about the last update (see below).
+    * is_best_checkpoint: A value in None, False, True, indicating whether this
+        was the best evaluation so far. You should use this to save the model
+        checkpoints during training. If None, evaluation was not conducted on
+        that iteration. False means evaluation was conducted, but a previous
+        evaluation was better.
+
+    The info dict provides the following information:
+
+        epoch (int): How many passes over the data have been completed.
+        step (int): How many steps have been completed.
+        score (float): The main score from the last evaluation.
+        other_scores: : The other scores from the last evaluation.
+        losses: The accumulated losses throughout training.
+        checkpoints: A list of previous results, where each result is a
+            (score, step, epoch) tuple.
+    """
+    if isinstance(dropout, float):
+        dropouts = constant(dropout)
+    else:
+        dropouts = dropout
+    results = []
+    losses: Dict[str, float] = {}
+    words_seen = 0
+    start_time = timer()
+    for step, (epoch, batch) in enumerate(distill_data):
+        if before_update:
+            before_update_args = {"step": step, "epoch": epoch}
+            before_update(student, before_update_args)
+        dropout = dropouts(optimizer.step)
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
+            student.distill(
+                teacher,
+                subbatch,
+                drop=dropout,
+                losses=losses,
+                sgd=False,
+                exclude=exclude,
+                annotates=annotating_components,
+                student_to_teacher=student_to_teacher,
+            )
+        # TODO: refactor this so we don't have to run it separately in here
+        for student_name, student_proc in student.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and student_proc.model not in (False, None)  # type: ignore[attr-defined]
+            ):
+                student_proc.finish_update(optimizer)  # type: ignore[attr-defined]
+        optimizer.step_schedules()
+        if not (step % eval_frequency):
+            if optimizer.averages:
+                with student.use_params(optimizer.averages):
+                    score, other_scores = evaluate()
+            else:
+                score, other_scores = evaluate()
+            optimizer.last_score = score  # type: ignore[assignment]
+            results.append((score, step))
+            is_best_checkpoint = score == max(results)[0]
+        else:
+            score, other_scores = (None, None)
+            is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
+        info = {
+            "epoch": epoch,
+            "step": step,
+            "score": score,
+            "other_scores": other_scores,
+            "losses": losses,
+            "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
+        }
+        yield batch, info, is_best_checkpoint
+        if is_best_checkpoint is not None:
+            losses = {}
+        # Stop if we've exhausted our max steps (if specified)
+        if max_steps and step >= max_steps:
+            break
+
+
 def train_while_improving(
     nlp: "Language",
     optimizer: Optimizer,
-    train_data,
-    evaluate,
+    train_data: Iterable[List[Example]],
+    evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
     eval_frequency: int,
@@ -174,10 +408,9 @@ def train_while_improving(
     Positional arguments:
         nlp: The spaCy pipeline to evaluate.
         optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
+        train_data (Iterable[List[Example]]): A generator of batches, with the
+            training data. The training data iterable needs to take care of
+            iterating over the epochs and shuffling.
         evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
             The callback should take no arguments and return a tuple
             `(main_score, other_scores)`. The main_score should be a float where
@@ -241,7 +474,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
-            optimizer.last_score = score
+            optimizer.last_score = score  # type: ignore[assignment]
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
@@ -273,9 +506,15 @@ def train_while_improving(
             break
 
 
-def subdivide_batch(batch, accumulate_gradient):
+def subdivide_batch(
+    batch: Union[Iterable[Doc], Iterable[Example]], accumulate_gradient: int
+):
     batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
+    if len(batch):
+        if isinstance(batch[0], Example):
+            batch.sort(key=lambda eg: len(eg.predicted))
+        else:
+            batch.sort(key=lambda doc: len(doc))
     sub_len = len(batch) // accumulate_gradient
     start = 0
     for i in range(accumulate_gradient):
@@ -320,6 +559,22 @@ def evaluate() -> Tuple[float, Dict[str, float]]:
     return evaluate
 
 
+def create_distill_batches(
+    nlp: "Language",
+    corpus: Callable[["Language"], Iterable[Example]],
+    batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
+    max_epochs: int,
+):
+    """Create distillation batches. In contrast to training, the corpus
+    is normally too large to load into memory and shuffle."""
+    epoch = 0
+    while max_epochs < 1 or epoch != max_epochs:
+        examples = corpus(nlp)
+        for batch in batcher(examples):
+            yield epoch, batch
+        epoch += 1
+
+
 def create_train_batches(
     nlp: "Language",
     corpus: Callable[["Language"], Iterable[Example]],
diff --git a/spacy/util.py b/spacy/util.py
index 7448da8ded0..3bb92e7334c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,7 @@
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+from thinc.api import fix_random_seed, set_gpu_allocator
 import functools
 import itertools
 import logging
@@ -1821,3 +1822,22 @@ def find_available_port(start: int, host: str, auto_select: bool = False) -> int
     # if we get here, the port changed
     warnings.warn(Warnings.W124.format(host=host, port=start, serve_port=port))
     return port
+
+
+def set_gpu_allocator_from_config(config: Config, use_gpu: int):
+    """Change the global GPU allocator based to the value in
+    the configuration."""
+    if "gpu_allocator" not in config["training"]:
+        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+
+
+def set_seed_from_config(config: Config):
+    """Set the random number generator seed to the value in
+    the configuration."""
+    if "seed" not in config["training"]:
+        raise ValueError(Errors.E1015.format(value="[training] seed"))
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index e38e49bf569..82cb1c14cef 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -347,19 +347,19 @@ Distill the models in a student pipeline from a teacher pipeline.
 > student.distill(teacher, examples, sgd=optimizer)
 > ```
 
-| Name                 | Description                                                                                                                                                                                 |
-| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
-| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_       |                                                                                                                                                                                             |
-| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
-| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
-| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
-| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
-| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
-| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
-| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name                 | Description                                                                                                                                                                                        |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                                 |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~        |
+| _keyword-only_       |                                                                                                                                                                                                    |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                        |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                                    |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                 |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                       |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                                     |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 

From 7e42cb28eeb8534cae693fcb103621d401360303 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:16:03 +0200
Subject: [PATCH 291/504] Remove Python 3.7 builds

---
 .github/workflows/tests.yml               | 61 +++++++++++------------
 .github/workflows/universe_validation.yml |  2 +-
 build-constraints.txt                     |  4 +-
 requirements.txt                          |  2 +-
 4 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 840b8e5f968..760a79f2121 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
           architecture: x64
 
       - name: black
@@ -60,11 +60,9 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
-          - os: windows-latest
-            python_version: "3.7"
           - os: macos-latest
             python_version: "3.8"
-          - os: ubuntu-latest
+          - os: ubuntu-20.04
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
@@ -95,7 +93,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'
 
       - name: Delete source directory and .egg-info
         run: |
@@ -117,22 +114,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      #      - name: "Test download CLI"
+      #        run: |
+      #          python -m spacy download ca_core_news_sm
+      #          python -m spacy download ca_core_news_md
+      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test download_url in info CLI"
+      #        run: |
+      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test no warnings on load (#11713)"
+      #        run: |
+      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |
@@ -156,17 +153,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      #      - name: "Test assemble CLI"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test assemble CLI vectors warning"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml
index a1e3253a9ba..c5e68784e00 100644
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
           architecture: x64
 
       - name: Validate website/meta/universe.json
diff --git a/build-constraints.txt b/build-constraints.txt
index b1cf596ca7c..781e403c59a 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -1,6 +1,4 @@
-# build version constraints for use with wheelwright
-numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
+# build version constraints for use with wheelwright + multibuild
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy>=1.25.0; python_version>='3.9'
diff --git a/requirements.txt b/requirements.txt
index a68c159d643..29420430aab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests

From 7ab1dae1cc072e5bbc131252bab628f814c7155d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:43:05 +0200
Subject: [PATCH 292/504] spancat type fixes

---
 spacy/pipeline/spancat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index bfaaf82e8d0..5c450f36a33 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -514,10 +514,9 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
         indices = activations["indices"]
         assert isinstance(indices, Ragged)
         scores = cast(Floats2d, activations["scores"])
-
         offset = 0
         for i, doc in enumerate(docs):
-            indices_i = indices[i].dataXd
+            indices_i = cast(Ints2d, indices[i].dataXd)
             if self.save_activations:
                 doc.activations[self.name] = {}
                 doc.activations[self.name]["indices"] = indices_i

From cdadabedb08af4887730143263def2810a5422b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Jun 2023 15:38:22 +0200
Subject: [PATCH 293/504] Account for differences between Span.sents in spaCy
 3/4

---
 spacy/tokens/span.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 8662481ac78..34e734ba68f 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -520,13 +520,13 @@ cdef class Span:
                     start = i
                     if start >= self.end:
                         break
-            if start < self.end:
-                spans.append(Span(self.doc, start, self.end))
-        return tuple(spans)
+                elif i == self.doc.length - 1:
+                    spans.append(Span(self.doc, start, self.doc.length))
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
-                yield Span(self.doc, start, self.doc.length)
+                spans.append(Span(self.doc, start, self.doc.length))
+        return tuple(spans)
 
     @property
     def ents(self):

From 48c9081451ca0301ca64b4d9a34ec3378af24aa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 23 Jun 2023 09:43:41 +0200
Subject: [PATCH 294/504] Set version to v4.0.0.dev1 (#12748)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 1ce8a44c9a4..ec1dde7cae6 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 02e719e7542e223efc453edf92bffea10af7542a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 11:41:03 +0200
Subject: [PATCH 295/504] isort all the things

---
 spacy/cli/__init__.py                         |  21 +-
 spacy/cli/_util.py                            |  21 +-
 spacy/cli/convert.py                          |   4 +-
 spacy/cli/debug_data.py                       |  10 +-
 spacy/cli/download.py                         |  12 +-
 spacy/cli/info.py                             |   2 +-
 spacy/cli/init_config.py                      |  13 +-
 spacy/cli/init_pipeline.py                    |  13 +-
 spacy/cli/project/assets.py                   | 218 +++++++++-
 spacy/cli/project/clone.py                    | 125 +++++-
 spacy/cli/project/document.py                 | 116 +++++-
 spacy/cli/project/dvc.py                      | 221 +++++++++-
 spacy/cli/project/pull.py                     |  68 +++-
 spacy/cli/project/push.py                     |  70 +++-
 spacy/cli/project/remote_storage.py           | 213 +++++++++-
 spacy/cli/project/run.py                      | 380 +++++++++++++++++-
 spacy/displacy/render.py                      |   1 +
 spacy/errors.py                               |   2 +-
 spacy/kb/__init__.py                          |   3 +-
 spacy/kb/candidate.pxd                        |   4 +-
 spacy/kb/candidate.pyx                        |   1 +
 spacy/kb/kb.pyx                               |   2 +-
 spacy/kb/kb_in_memory.pyx                     |   5 +-
 spacy/language.py                             |  61 +--
 spacy/lexeme.pxd                              |  17 +-
 spacy/lexeme.pyx                              |   3 +-
 spacy/matcher/dependencymatcher.pyx           |   2 +-
 spacy/matcher/matcher.pyi                     |  17 +-
 spacy/matcher/matcher.pyx                     |  26 +-
 spacy/matcher/phrasematcher.pyi               |   6 +-
 spacy/matcher/phrasematcher.pyx               |   8 +-
 spacy/ml/models/entity_linker.py              |  16 +-
 spacy/ml/models/parser.py                     |   9 +-
 spacy/ml/models/tok2vec.py                    |   2 -
 spacy/ml/staticvectors.py                     |   6 +-
 spacy/ml/tb_framework.pyx                     |  37 +-
 spacy/morphology.pxd                          |   2 +-
 spacy/morphology.pyx                          |   8 +-
 .../pipeline/_edit_tree_internals/schemas.py  |   8 +-
 .../_parser_internals/_beam_utils.pxd         |   1 +
 .../_parser_internals/_beam_utils.pyx         |  11 +-
 spacy/pipeline/_parser_internals/_state.pxd   |   2 -
 .../pipeline/_parser_internals/arc_eager.pyx  |   5 +-
 spacy/pipeline/_parser_internals/ner.pyx      |   9 +-
 spacy/pipeline/_parser_internals/search.pxd   |   6 +-
 spacy/pipeline/_parser_internals/search.pyx   |   5 +-
 .../pipeline/_parser_internals/stateclass.pyx |   3 +-
 .../_parser_internals/transition_system.pyx   |   4 +-
 spacy/pipeline/attribute_ruler.py             |   2 +-
 spacy/pipeline/dep_parser.py                  |  12 +-
 spacy/pipeline/edit_tree_lemmatizer.py        |  10 +-
 spacy/pipeline/entity_linker.py               |  30 +-
 spacy/pipeline/morphologizer.pyx              |  22 +-
 spacy/pipeline/ner.py                         |  21 +-
 spacy/pipeline/pipe.pyx                       |   4 +-
 spacy/pipeline/sentencizer.pyx                |   4 +-
 spacy/pipeline/senter.pyx                     |  12 +-
 spacy/pipeline/span_ruler.py                  |  10 +-
 spacy/pipeline/spancat.py                     |  20 +-
 spacy/pipeline/tagger.pyx                     |  22 +-
 spacy/pipeline/textcat.py                     |   6 +-
 spacy/pipeline/textcat_multilabel.py          |   6 +-
 spacy/pipeline/tok2vec.py                     |   6 +-
 spacy/pipeline/trainable_pipe.pyx             |  13 +-
 spacy/pipeline/transition_parser.pyx          |  52 ++-
 spacy/schemas.py                              |  51 +--
 spacy/strings.pxd                             |   5 +-
 spacy/strings.pyi                             |   3 +-
 spacy/strings.pyx                             |   6 +-
 spacy/tests/conftest.py                       |   8 +-
 spacy/tests/doc/test_span.py                  |   1 -
 spacy/tests/doc/test_underscore.py            |   1 +
 spacy/tests/parser/_search.pyx                |   7 +-
 spacy/tests/parser/test_ner.py                |   3 +-
 spacy/tests/parser/test_parse.py              |  12 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |   3 +-
 spacy/tests/pipeline/test_entity_linker.py    |   2 +-
 spacy/tests/pipeline/test_entity_ruler.py     |   8 +-
 spacy/tests/pipeline/test_initialize.py       |   7 +-
 spacy/tests/pipeline/test_morphologizer.py    |   3 +-
 spacy/tests/pipeline/test_pipe_factories.py   |   2 +
 spacy/tests/pipeline/test_senter.py           |   1 +
 spacy/tests/pipeline/test_spancat.py          |   7 +-
 spacy/tests/pipeline/test_tagger.py           |   3 +-
 spacy/tests/pipeline/test_textcat.py          |  18 +-
 .../tests/serialize/test_serialize_config.py  |  25 +-
 .../serialize/test_serialize_pipeline.py      |  11 +-
 spacy/tests/test_cli.py                       |  20 +-
 spacy/tests/test_cli_app.py                   |   2 +-
 spacy/tests/test_language.py                  |  16 +-
 spacy/tests/test_misc.py                      |  20 +-
 spacy/tests/test_symbols.py                   |   1 +
 spacy/tests/training/test_loop.py             |   4 +-
 spacy/tests/training/test_training.py         |  15 +-
 spacy/tokenizer.pxd                           |   5 -
 spacy/tokenizer.pyx                           |   8 +-
 spacy/tokens/__init__.py                      |   4 +-
 spacy/tokens/doc.pyi                          |  12 +-
 spacy/tokens/doc.pyx                          |  23 +-
 spacy/tokens/doc_bin.py                       |   4 +-
 spacy/tokens/graph.pyx                        |   6 +-
 spacy/tokens/morphanalysis.pxd                |   7 +-
 spacy/tokens/morphanalysis.pyx                |  10 +-
 spacy/tokens/retokenizer.pyx                  |  10 +-
 spacy/tokens/span.pxd                         |   2 +-
 spacy/tokens/span.pyx                         |  13 +-
 spacy/tokens/span_group.pyx                   |   7 +-
 spacy/tokens/token.pyx                        |   4 +-
 spacy/training/__init__.py                    |  29 +-
 spacy/training/align.pyx                      |   1 -
 spacy/training/batchers.py                    |  13 +
 spacy/training/callbacks.py                   |   6 +-
 spacy/training/converters/json_to_docs.py     |  12 +-
 spacy/training/example.pyx                    |   1 +
 spacy/training/gold_io.pyx                    |   4 +-
 spacy/training/initialize.py                  |  29 +-
 spacy/training/loop.py                        |  34 +-
 spacy/ty.py                                   |  16 +-
 spacy/util.py                                 |  27 +-
 spacy/vectors.pyx                             |  12 +-
 spacy/vocab.pyx                               |   3 +
 121 files changed, 2016 insertions(+), 602 deletions(-)

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 3095778fe22..b2612f57720 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,7 +16,6 @@
 from .debug_model import debug_model  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
-from .find_function import find_function  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
 from .info import info  # noqa: F401
 from .init_config import fill_config, init_config  # noqa: F401
@@ -24,17 +23,15 @@
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
-from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
-from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
-    project_document,
-)
-from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
-from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
-from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
-from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
-from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
-from .validate import validate  # type: ignore[attr-defined]  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 52a70cc7320..b005accf91f 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,10 +1,3 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
 import os
 import shutil
@@ -18,6 +11,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -32,15 +26,10 @@
 from thinc.util import gpu_is_available
 from typer.main import get_command
 from wasabi import Printer, msg
-from weasel import app as project_cli
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
-from ..compat import Literal
-from ..schemas import validate
+from ..errors import RENAMED_LANGUAGE_CODES
+from ..schemas import ProjectConfigSchema, validate
 from ..util import (
     ENV_VARS,
     SimpleFrozenDict,
@@ -52,6 +41,10 @@
     run_command,
 )
 
+if TYPE_CHECKING:
+    from pathy import FluidPath  # noqa: F401
+
+
 SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"
 
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3844b340678..a282e59c749 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,8 +8,6 @@
 import srsly
 from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
@@ -18,7 +16,7 @@
     iob_to_docs,
     json_to_docs,
 )
-from ._util import Arg, Opt, app, walk_directory
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c2253b0cb70..4c44a8c0e2b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,11 +1,3 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
 import sys
 from collections import Counter
@@ -15,6 +7,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,
@@ -30,7 +23,6 @@
 from wasabi import MESSAGES, Printer, msg
 
 from .. import util
-from ..compat import Literal
 from ..language import Language
 from ..morphology import Morphology
 from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea7a06c5417..0635522930b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,9 +7,15 @@
 from wasabi import msg
 
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 8bfc6b54f15..7a891547e0a 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,3 +1,4 @@
+import importlib.metadata
 import json
 import platform
 from pathlib import Path
@@ -7,7 +8,6 @@
 from wasabi import MarkdownRenderer, Printer
 
 from .. import about, util
-from ..compat import importlib_metadata
 from ._util import Arg, Opt, app, string_to_list
 from .download import get_latest_version, get_model_filename
 
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b29a2b748f2..ca0c316ca20 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,9 +12,16 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
-
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 1a044dedbc9..991dc1a822c 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,8 +8,17 @@
 
 from .. import util
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 591d1959e73..aa270598621 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -1 +1,217 @@
-from weasel.cli.assets import *
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import requests
+import typer
+from wasabi import msg
+
+from ...util import ensure_path, working_dir
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
+
+# Whether assets are extra if `extra` is not set.
+EXTRA_DEFAULT = False
+
+
+@project_cli.command(
+    "assets",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def project_assets_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
+    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
+    # fmt: on
+):
+    """Fetch project assets like datasets and pretrained weights. Assets are
+    defined in the "assets" section of the project.yml. If a checksum is
+    provided in the project.yml, the file is only downloaded if no local file
+    with the same checksum exists.
+
+    DOCS: https://spacy.io/api/cli#project-assets
+    """
+    overrides = parse_config_overrides(ctx.args)
+    project_assets(
+        project_dir,
+        overrides=overrides,
+        sparse_checkout=sparse_checkout,
+        extra=extra,
+    )
+
+
+def project_assets(
+    project_dir: Path,
+    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    sparse_checkout: bool = False,
+    extra: bool = False,
+) -> None:
+    """Fetch assets for a project using DVC if possible.
+
+    project_dir (Path): Path to project directory.
+    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
+                            needed.
+    extra (bool): Whether to download all assets, including those marked as 'extra'.
+    """
+    project_path = ensure_path(project_dir)
+    config = load_project_config(project_path, overrides=overrides)
+    assets = [
+        asset
+        for asset in config.get("assets", [])
+        if extra or not asset.get("extra", EXTRA_DEFAULT)
+    ]
+    if not assets:
+        msg.warn(
+            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
+            exits=0,
+        )
+    msg.info(f"Fetching {len(assets)} asset(s)")
+
+    for asset in assets:
+        dest = (project_dir / asset["dest"]).resolve()
+        checksum = asset.get("checksum")
+        if "git" in asset:
+            git_err = (
+                f"Cloning spaCy project templates requires Git and the 'git' command. "
+                f"Make sure it's installed and that the executable is available."
+            )
+            get_git_version(error=git_err)
+            if dest.exists():
+                # If there's already a file, check for checksum
+                if checksum and checksum == get_checksum(dest):
+                    msg.good(
+                        f"Skipping download with matching checksum: {asset['dest']}"
+                    )
+                    continue
+                else:
+                    if dest.is_dir():
+                        shutil.rmtree(dest)
+                    else:
+                        dest.unlink()
+            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
+                msg.fail(
+                    "A git asset must include 'repo', the repository address.", exits=1
+                )
+            if "path" not in asset["git"] or asset["git"]["path"] is None:
+                msg.fail(
+                    "A git asset must include 'path' - use \"\" to get the entire repository.",
+                    exits=1,
+                )
+            git_checkout(
+                asset["git"]["repo"],
+                asset["git"]["path"],
+                dest,
+                branch=asset["git"].get("branch"),
+                sparse=sparse_checkout,
+            )
+            msg.good(f"Downloaded asset {dest}")
+        else:
+            url = asset.get("url")
+            if not url:
+                # project.yml defines asset without URL that the user has to place
+                check_private_asset(dest, checksum)
+                continue
+            fetch_asset(project_path, url, dest, checksum)
+
+
+def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
+    """Check and validate assets without a URL (private assets that the user
+    has to provide themselves) and give feedback about the checksum.
+
+    dest (Path): Destination path of the asset.
+    checksum (Optional[str]): Optional checksum of the expected file.
+    """
+    if not Path(dest).exists():
+        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
+        msg.warn(err)
+    else:
+        if not checksum:
+            msg.good(f"Asset already exists: {dest}")
+        elif checksum == get_checksum(dest):
+            msg.good(f"Asset exists with matching checksum: {dest}")
+        else:
+            msg.fail(f"Asset available but with incorrect checksum: {dest}")
+
+
+def fetch_asset(
+    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
+) -> None:
+    """Fetch an asset from a given URL or path. If a checksum is provided and a
+    local file exists, it's only re-downloaded if the checksum doesn't match.
+
+    project_path (Path): Path to project directory.
+    url (str): URL or path to asset.
+    checksum (Optional[str]): Optional expected checksum of local file.
+    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
+        the asset failed.
+    """
+    dest_path = (project_path / dest).resolve()
+    if dest_path.exists():
+        # If there's already a file, check for checksum
+        if checksum:
+            if checksum == get_checksum(dest_path):
+                msg.good(f"Skipping download with matching checksum: {dest}")
+                return
+        else:
+            # If there's not a checksum, make sure the file is a possibly valid size
+            if os.path.getsize(dest_path) == 0:
+                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
+                os.remove(dest_path)
+    # We might as well support the user here and create parent directories in
+    # case the asset dir isn't listed as a dir to create in the project.yml
+    if not dest_path.parent.exists():
+        dest_path.parent.mkdir(parents=True)
+    with working_dir(project_path):
+        url = convert_asset_url(url)
+        try:
+            download_file(url, dest_path)
+            msg.good(f"Downloaded asset {dest}")
+        except requests.exceptions.RequestException as e:
+            if Path(url).exists() and Path(url).is_file():
+                # If it's a local file, copy to destination
+                shutil.copy(url, str(dest_path))
+                msg.good(f"Copied local asset {dest}")
+            else:
+                msg.fail(f"Download failed: {dest}", e)
+    if checksum and checksum != get_checksum(dest_path):
+        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
+
+
+def convert_asset_url(url: str) -> str:
+    """Check and convert the asset URL if needed.
+
+    url (str): The asset URL.
+    RETURNS (str): The converted URL.
+    """
+    # If the asset URL is a regular GitHub URL it's likely a mistake
+    if (
+        re.match(r"(http(s?)):\/\/github.com", url)
+        and "releases/download" not in url
+        and "/raw/" not in url
+    ):
+        converted = url.replace("github.com", "raw.githubusercontent.com")
+        converted = re.sub(r"/(tree|blob)/", "/", converted)
+        msg.warn(
+            "Downloading from a regular GitHub URL. This will only download "
+            "the source of the page, not the actual file. Converting the URL "
+            "to a raw URL.",
+            converted,
+        )
+        return converted
+    return url
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 11d2511a361..2ee27c92adb 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1 +1,124 @@
-from weasel.cli.clone import *
+import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
+
+from ... import about
+from ...util import ensure_path
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)
+
+DEFAULT_REPO = about.__projects__
+DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
+DEFAULT_BRANCHES = ["main", "master"]
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
+    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
+    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo).
+
+    DOCS: https://spacy.io/api/cli#project-clone
+    """
+    if dest is None:
+        dest = Path.cwd() / Path(name).parts[-1]
+    if repo == DEFAULT_REPO and branch is None:
+        branch = DEFAULT_PROJECTS_BRANCH
+
+    if branch is None:
+        for default_branch in DEFAULT_BRANCHES:
+            if git_repo_branch_exists(repo, default_branch):
+                branch = default_branch
+                break
+        if branch is None:
+            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
+            msg.fail(
+                "No branch provided and attempted default "
+                f"branches {default_branches_msg} do not exist.",
+                exits=1,
+            )
+    else:
+        if not git_repo_branch_exists(repo, branch):
+            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
+    assert isinstance(branch, str)
+    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
+
+
+def project_clone(
+    name: str,
+    dest: Path,
+    *,
+    repo: str = about.__projects__,
+    branch: str = about.__projects_branch__,
+    sparse_checkout: bool = False,
+) -> None:
+    """Clone a project template from a repository.
+
+    name (str): Name of subdirectory to clone.
+    dest (Path): Destination path of cloned project.
+    repo (str): URL of Git repo containing project templates.
+    branch (str): The branch to clone from
+    """
+    dest = ensure_path(dest)
+    check_clone(name, dest, repo)
+    project_dir = dest.resolve()
+    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
+    try:
+        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
+    except subprocess.CalledProcessError:
+        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
+        msg.fail(err, exits=1)
+    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
+    if not (project_dir / PROJECT_FILE).exists():
+        msg.warn(f"No {PROJECT_FILE} found in directory")
+    else:
+        msg.good(f"Your project is now ready!")
+        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    git_err = (
+        f"Cloning spaCy project templates requires Git and the 'git' command. "
+        f"To clone a project without Git, copy the files from the '{name}' "
+        f"directory in the {repo} to {dest} manually."
+    )
+    get_git_version(error=git_err)
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
+            f"Create the necessary folder(s) first before continuing.",
+            exits=1,
+        )
diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
index 1952524a933..80107d27acf 100644
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@@ -1 +1,115 @@
-from weasel.cli.document import *
+from pathlib import Path
+
+from wasabi import MarkdownRenderer, msg
+
+from ...util import working_dir
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
+
+DOCS_URL = "https://spacy.io"
+INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
+project, as well as the available commands and workflows. For details, see the
+[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
+INTRO_COMMANDS = f"""The following commands are defined by the project. They
+can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
+Commands are only re-run if their inputs have changed."""
+INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
+can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
+and will run the specified commands in order. Commands are only re-run if their
+inputs have changed."""
+INTRO_ASSETS = f"""The following assets are defined by the project. They can
+be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
+in the project directory."""
+# These markers are added to the Markdown and can be used to update the file in
+# place if it already exists. Only the auto-generated part will be replaced.
+MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
+MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
+# If this marker is used in an existing README, it's ignored and not replaced
+MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
+
+
+@project_cli.command("document")
+def project_document_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
+    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
+    # fmt: on
+):
+    """
+    Auto-generate a README.md for a project. If the content is saved to a file,
+    hidden markers are added so you can add custom content before or after the
+    auto-generated section and only the auto-generated docs will be replaced
+    when you re-run the command.
+
+    DOCS: https://spacy.io/api/cli#project-document
+    """
+    project_document(project_dir, output_file, no_emoji=no_emoji)
+
+
+def project_document(
+    project_dir: Path, output_file: Path, *, no_emoji: bool = False
+) -> None:
+    is_stdout = str(output_file) == "-"
+    config = load_project_config(project_dir)
+    md = MarkdownRenderer(no_emoji=no_emoji)
+    md.add(MARKER_START)
+    title = config.get("title")
+    description = config.get("description")
+    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
+    if description:
+        md.add(description)
+    md.add(md.title(2, PROJECT_FILE, "📋"))
+    md.add(INTRO_PROJECT)
+    # Commands
+    cmds = config.get("commands", [])
+    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
+    if data:
+        md.add(md.title(3, "Commands", "⏯"))
+        md.add(INTRO_COMMANDS)
+        md.add(md.table(data, ["Command", "Description"]))
+    # Workflows
+    wfs = config.get("workflows", {}).items()
+    data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
+    if data:
+        md.add(md.title(3, "Workflows", "⏭"))
+        md.add(INTRO_WORKFLOWS)
+        md.add(md.table(data, ["Workflow", "Steps"]))
+    # Assets
+    assets = config.get("assets", [])
+    data = []
+    for a in assets:
+        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
+        dest_path = a["dest"]
+        dest = md.code(dest_path)
+        if source == "Local":
+            # Only link assets if they're in the repo
+            with working_dir(project_dir) as p:
+                if (p / dest_path).exists():
+                    dest = md.link(dest, dest_path)
+        data.append((dest, source, a.get("description", "")))
+    if data:
+        md.add(md.title(3, "Assets", "🗂"))
+        md.add(INTRO_ASSETS)
+        md.add(md.table(data, ["File", "Source", "Description"]))
+    md.add(MARKER_END)
+    # Output result
+    if is_stdout:
+        print(md.text)
+    else:
+        content = md.text
+        if output_file.exists():
+            with output_file.open("r", encoding="utf8") as f:
+                existing = f.read()
+            if MARKER_IGNORE in existing:
+                msg.warn("Found ignore marker in existing file: skipping", output_file)
+                return
+            if MARKER_START in existing and MARKER_END in existing:
+                msg.info("Found existing file: only replacing auto-generated docs")
+                before = existing.split(MARKER_START)[0]
+                after = existing.split(MARKER_END)[1]
+                content = f"{before}{content}{after}"
+            else:
+                msg.warn("Replacing existing file")
+        with output_file.open("w", encoding="utf8") as f:
+            f.write(content)
+        msg.good("Saved project documentation", output_file)
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index aa1ae7dd9ed..9ad55c43302 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -1 +1,220 @@
-from weasel.cli.dvc import *
+"""This module contains helpers and subcommands for integrating spaCy projects
+with Data Version Controk (DVC). https://dvc.org"""
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+from wasabi import msg
+
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)
+
+DVC_CONFIG = "dvc.yaml"
+DVC_DIR = ".dvc"
+UPDATE_COMMAND = "dvc"
+DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
+# edited your {PROJECT_FILE}, you can regenerate this file by running:
+# {COMMAND} project {UPDATE_COMMAND}"""
+
+
+@project_cli.command(UPDATE_COMMAND)
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Auto-generate Data Version Control (DVC) config. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. If no workflow is specified, the first defined
+    workflow is used. The DVC config will only be updated if the project.yml
+    changed.
+
+    DOCS: https://spacy.io/api/cli#project-dvc
+    """
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
+
+
+def project_update_dvc(
+    project_dir: Path,
+    workflow: Optional[str] = None,
+    *,
+    verbose: bool = False,
+    quiet: bool = False,
+    force: bool = False,
+) -> None:
+    """Update the auto-generated Data Version Control (DVC) config file. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. Will only update the file if the checksum changed.
+
+    project_dir (Path): The project directory.
+    workflow (Optional[str]): Optional name of workflow defined in project.yml.
+        If not set, the first workflow will be used.
+    verbose (bool): Print more info.
+    quiet (bool): Print less info.
+    force (bool): Force update DVC config.
+    """
+    config = load_project_config(project_dir)
+    updated = update_dvc_config(
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
+    )
+    help_msg = "To execute the workflow with DVC, run: dvc repro"
+    if updated:
+        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
+    else:
+        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
+
+
+def update_dvc_config(
+    path: Path,
+    config: Dict[str, Any],
+    workflow: Optional[str] = None,
+    verbose: bool = False,
+    quiet: bool = False,
+    force: bool = False,
+) -> bool:
+    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
+    project directory. The file is auto-generated based on the config. The
+    first line of the auto-generated file specifies the hash of the config
+    dict, so if any of the config values change, the DVC config is regenerated.
+
+    path (Path): The path to the project directory.
+    config (Dict[str, Any]): The loaded project.yml.
+    verbose (bool): Whether to print additional info (via DVC).
+    quiet (bool): Don't output anything (via DVC).
+    force (bool): Force update, even if hashes match.
+    RETURNS (bool): Whether the DVC config file was updated.
+    """
+    ensure_dvc(path)
+    workflows = config.get("workflows", {})
+    workflow_names = list(workflows.keys())
+    check_workflows(workflow_names, workflow)
+    if not workflow:
+        workflow = workflow_names[0]
+    config_hash = get_hash(config)
+    path = path.resolve()
+    dvc_config_path = path / DVC_CONFIG
+    if dvc_config_path.exists():
+        # Check if the file was generated using the current config, if not, redo
+        with dvc_config_path.open("r", encoding="utf8") as f:
+            ref_hash = f.readline().strip().replace("# ", "")
+        if ref_hash == config_hash and not force:
+            return False  # Nothing has changed in project.yml, don't need to update
+        dvc_config_path.unlink()
+    dvc_commands = []
+    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
+    for name in workflows[workflow]:
+        command = config_commands[name]
+        deps = command.get("deps", [])
+        outputs = command.get("outputs", [])
+        outputs_no_cache = command.get("outputs_no_cache", [])
+        if not deps and not outputs and not outputs_no_cache:
+            continue
+        # Default to the working dir as the project path since dvc.yaml is auto-generated
+        # and we don't want arbitrary paths in there
+        project_cmd = ["python", "-m", NAME, "project", "run", name]
+        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
+        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
+        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
+
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
+        if command.get("no_skip"):
+            dvc_cmd.append("--always-changed")
+        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
+        dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
+    with working_dir(path):
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
+    with dvc_config_path.open("r+", encoding="utf8") as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
+    return True
+
+
+def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
+    """Validate workflows provided in project.yml and check that a given
+    workflow can be used to generate a DVC config.
+
+    workflows (List[str]): Names of the available workflows.
+    workflow (Optional[str]): The name of the workflow to convert.
+    """
+    if not workflows:
+        msg.fail(
+            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
+            f"define at least one list of commands.",
+            exits=1,
+        )
+    if workflow is not None and workflow not in workflows:
+        msg.fail(
+            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
+            f"Available workflows: {', '.join(workflows)}",
+            exits=1,
+        )
+    if not workflow:
+        msg.warn(
+            f"No workflow specified for DVC pipeline. Using the first workflow "
+            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
+        )
+
+
+def ensure_dvc(project_dir: Path) -> None:
+    """Ensure that the "dvc" command is available and that the current project
+    directory is an initialized DVC project.
+    """
+    try:
+        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            "To use spaCy projects with DVC (Data Version Control), DVC needs "
+            "to be installed and the 'dvc' command needs to be available",
+            "You can install the Python package from pip (pip install dvc) or "
+            "conda (conda install -c conda-forge dvc). For more details, see the "
+            "documentation: https://dvc.org/doc/install",
+            exits=1,
+        )
+    if not (project_dir / ".dvc").exists():
+        msg.fail(
+            "Project not initialized as a DVC project",
+            "To initialize a DVC project, you can run 'dvc init' in the project "
+            "directory. For more details, see the documentation: "
+            "https://dvc.org/doc/command-reference/init",
+            exits=1,
+        )
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 5e603273d94..e9be74df7f4 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -1 +1,67 @@
-from weasel.cli.pull import *
+from pathlib import Path
+
+from wasabi import msg
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
+from .run import update_lockfile
+
+
+@project_cli.command("pull")
+def project_pull_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Retrieve available precomputed outputs from a remote storage.
+    You can alias remotes in your project.yml by mapping them to storage paths.
+    A storage can be anything that the smart-open library can upload to, e.g.
+    AWS, Google Cloud Storage, SSH, local directories etc.
+
+    DOCS: https://spacy.io/api/cli#project-pull
+    """
+    for url, output_path in project_pull(project_dir, remote):
+        if url is not None:
+            msg.good(f"Pulled {output_path} from {url}")
+
+
+def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    # TODO: We don't have tests for this :(. It would take a bit of mockery to
+    # set up. I guess see if it breaks first?
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    commands = list(config.get("commands", []))
+    # We use a while loop here because we don't know how the commands
+    # will be ordered. A command might need dependencies from one that's later
+    # in the list.
+    while commands:
+        for i, cmd in enumerate(list(commands)):
+            logger.debug("CMD: %s.", cmd["name"])
+            deps = [project_dir / dep for dep in cmd.get("deps", [])]
+            if all(dep.exists() for dep in deps):
+                cmd_hash = get_command_hash("", "", deps, cmd["script"])
+                for output_path in cmd.get("outputs", []):
+                    url = storage.pull(output_path, command_hash=cmd_hash)
+                    logger.debug(
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
+                    )
+                    yield url, output_path
+
+                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
+                if all(loc.exists() for loc in out_locs):
+                    update_lockfile(project_dir, cmd)
+                # We remove the command from the list here, and break, so that
+                # we iterate over the loop again.
+                commands.pop(i)
+                break
+            else:
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
+        else:
+            # If we didn't break the for loop, break the while loop.
+            break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index 3a8e8869db1..a7915e54741 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -1 +1,69 @@
-from weasel.cli.push import *
+from pathlib import Path
+
+from wasabi import msg
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
+
+
+@project_cli.command("push")
+def project_push_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Persist outputs to a remote storage. You can alias remotes in your
+    project.yml by mapping them to storage paths. A storage can be anything that
+    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
+    local directories etc.
+
+    DOCS: https://spacy.io/api/cli#project-push
+    """
+    for output_path, url in project_push(project_dir, remote):
+        if url is None:
+            msg.info(f"Skipping {output_path}")
+        else:
+            msg.good(f"Pushed {output_path} to {url}")
+
+
+def project_push(project_dir: Path, remote: str):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. gcs, aws, ssh, local directories etc
+    """
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        logger.debug("CMD: %s", cmd["name"])
+        deps = [project_dir / dep for dep in cmd.get("deps", [])]
+        if any(not dep.exists() for dep in deps):
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
+            continue
+        cmd_hash = get_command_hash(
+            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
+        )
+        logger.debug("CMD_HASH: %s", cmd_hash)
+        for output_path in cmd.get("outputs", []):
+            output_loc = project_dir / output_path
+            if output_loc.exists() and _is_not_empty_dir(output_loc):
+                url = storage.push(
+                    output_path,
+                    command_hash=cmd_hash,
+                    content_hash=get_content_hash(output_loc),
+                )
+                logger.debug(
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
+                )
+                yield output_path, url
+
+
+def _is_not_empty_dir(loc: Path):
+    if not loc.is_dir():
+        return True
+    elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
+        return True
+    else:
+        return False
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 29409150fad..84235a90d39 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -1 +1,212 @@
-from weasel.cli.remote_storage import *
+import hashlib
+import os
+import site
+import tarfile
+import urllib.parse
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from wasabi import msg
+
+from ... import about
+from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)
+
+if TYPE_CHECKING:
+    from pathy import FluidPath  # noqa: F401
+
+
+class RemoteStorage:
+    """Push and pull outputs to and from a remote file storage.
+
+    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
+    ssh, etc.
+    """
+
+    def __init__(self, project_root: Path, url: str, *, compression="gz"):
+        self.root = project_root
+        self.url = ensure_pathy(url)
+        self.compression = compression
+
+    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
+        """Compress a file or directory within a project and upload it to a remote
+        storage. If an object exists at the full URL, nothing is done.
+
+        Within the remote storage, files are addressed by their project path
+        (url encoded) and two user-supplied hashes, representing their creation
+        context and their file contents. If the URL already exists, the data is
+        not uploaded. Paths are archived and compressed prior to upload.
+        """
+        loc = self.root / path
+        if not loc.exists():
+            raise IOError(f"Cannot push {loc}: does not exist.")
+        url = self.make_url(path, command_hash, content_hash)
+        if url.exists():
+            return url
+        tmp: Path
+        with make_tempdir() as tmp:
+            tar_loc = tmp / self.encode_name(str(path))
+            mode_string = f"w:{self.compression}" if self.compression else "w"
+            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                tar_file.add(str(loc), arcname=str(path))
+            upload_file(tar_loc, url)
+        return url
+
+    def pull(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["FluidPath"]:
+        """Retrieve a file from the remote cache. If the file already exists,
+        nothing is done.
+
+        If the command_hash and/or content_hash are specified, only matching
+        results are returned. If no results are available, an error is raised.
+        """
+        dest = self.root / path
+        if dest.exists():
+            return None
+        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
+        if url is None:
+            return url
+        else:
+            # Make sure the destination exists
+            if not dest.parent.exists():
+                dest.parent.mkdir(parents=True)
+            tmp: Path
+            with make_tempdir() as tmp:
+                tar_loc = tmp / url.parts[-1]
+                download_file(url, tar_loc)
+                mode_string = f"r:{self.compression}" if self.compression else "r"
+                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                    # This requires that the path is added correctly, relative
+                    # to root. This is how we set things up in push()
+
+                    # Disallow paths outside the current directory for the tar
+                    # file (CVE-2007-4559, directory traversal vulnerability)
+                    def is_within_directory(directory, target):
+                        abs_directory = os.path.abspath(directory)
+                        abs_target = os.path.abspath(target)
+                        prefix = os.path.commonprefix([abs_directory, abs_target])
+                        return prefix == abs_directory
+
+                    def safe_extract(tar, path):
+                        for member in tar.getmembers():
+                            member_path = os.path.join(path, member.name)
+                            if not is_within_directory(path, member_path):
+                                raise ValueError(Errors.E852)
+                        tar.extractall(path)
+
+                    safe_extract(tar_file, self.root)
+        return url
+
+    def find(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["FluidPath"]:
+        """Find the best matching version of a file within the storage,
+        or `None` if no match can be found. If both the creation and content hash
+        are specified, only exact matches will be returned. Otherwise, the most
+        recent matching file is preferred.
+        """
+        name = self.encode_name(str(path))
+        urls = []
+        if command_hash is not None and content_hash is not None:
+            url = self.url / name / command_hash / content_hash
+            urls = [url] if url.exists() else []
+        elif command_hash is not None:
+            if (self.url / name / command_hash).exists():
+                urls = list((self.url / name / command_hash).iterdir())
+        else:
+            if (self.url / name).exists():
+                for sub_dir in (self.url / name).iterdir():
+                    urls.extend(sub_dir.iterdir())
+                if content_hash is not None:
+                    urls = [url for url in urls if url.parts[-1] == content_hash]
+        if len(urls) >= 2:
+            try:
+                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
+            except Exception:
+                msg.warn(
+                    "Unable to sort remote files by last modified. The file(s) "
+                    "pulled from the cache may not be the most recent."
+                )
+        return urls[-1] if urls else None
+
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
+        """Construct a URL from a subpath, a creation hash and a content hash."""
+        return self.url / self.encode_name(str(path)) / command_hash / content_hash
+
+    def encode_name(self, name: str) -> str:
+        """Encode a subpath into a URL-safe name."""
+        return urllib.parse.quote_plus(name)
+
+
+def get_content_hash(loc: Path) -> str:
+    return get_checksum(loc)
+
+
+def get_command_hash(
+    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
+) -> str:
+    """Create a hash representing the execution of a command. This includes the
+    currently installed packages, whatever environment variables have been marked
+    as relevant, and the command.
+    """
+    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
+        spacy_v = GIT_VERSION
+    else:
+        spacy_v = str(get_minor_version(about.__version__) or "")
+    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
+    hashes.extend(cmd)
+    creation_bytes = "".join(hashes).encode("utf8")
+    return hashlib.md5(creation_bytes).hexdigest()
+
+
+def get_site_hash():
+    """Hash the current Python environment's site-packages contents, including
+    the name and version of the libraries. The list we're hashing is what
+    `pip freeze` would output.
+    """
+    site_dirs = site.getsitepackages()
+    if site.ENABLE_USER_SITE:
+        site_dirs.extend(site.getusersitepackages())
+    packages = set()
+    for site_dir in site_dirs:
+        site_dir = Path(site_dir)
+        for subpath in site_dir.iterdir():
+            if subpath.parts[-1].endswith("dist-info"):
+                packages.add(subpath.parts[-1].replace(".dist-info", ""))
+    package_bytes = "".join(sorted(packages)).encode("utf8")
+    return hashlib.md5sum(package_bytes).hexdigest()
+
+
+def get_env_hash(env: Dict[str, str]) -> str:
+    """Construct a hash of the environment variables that will be passed into
+    the commands.
+
+    Values in the env dict may be references to the current os.environ, using
+    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
+    """
+    env_vars = {}
+    for key, value in env.items():
+        if value.startswith("$"):
+            env_vars[key] = os.environ.get(value[1:], "")
+        else:
+            env_vars[key] = value
+    return get_hash(env_vars)
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index cc6a5ac4256..43972a2026a 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -1 +1,379 @@
-from weasel.cli.run import *
+import os.path
+import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
+import srsly
+import typer
+from wasabi import msg
+from wasabi.util import locale_escape
+
+from ... import about
+from ...git_info import GIT_VERSION
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
+
+
+@project_cli.command(
+    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
+)
+def project_run_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named command or workflow defined in the project.yml. If a workflow
+    name is specified, all commands in the workflow are run, in order. If
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
+
+    DOCS: https://spacy.io/api/cli#project-run
+    """
+    if show_help or not subcommand:
+        print_run_help(project_dir, subcommand)
+    else:
+        overrides = parse_config_overrides(ctx.args)
+        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
+
+
+def project_run(
+    project_dir: Path,
+    subcommand: str,
+    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    force: bool = False,
+    dry: bool = False,
+    capture: bool = False,
+    skip_requirements_check: bool = False,
+) -> None:
+    """Run a named script defined in the project.yml. If the script is part
+    of the default pipeline (defined in the "run" section), DVC is used to
+    execute the command, so it can determine whether to rerun it. It then
+    calls into "exec" to execute it.
+
+    project_dir (Path): Path to project directory.
+    subcommand (str): Name of command to run.
+    overrides (Dict[str, Any]): Optional config overrides.
+    force (bool): Force re-running, even if nothing changed.
+    dry (bool): Perform a dry run and don't execute commands.
+    capture (bool): Whether to capture the output and errors of individual commands.
+        If False, the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the return code. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
+    skip_requirements_check (bool): Whether to skip the requirements check.
+    """
+    config = load_project_config(project_dir, overrides=overrides)
+    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    workflows = config.get("workflows", {})
+    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+
+    req_path = project_dir / "requirements.txt"
+    if not skip_requirements_check:
+        if config.get("check_requirements", True) and os.path.exists(req_path):
+            with req_path.open() as requirements_file:
+                _check_requirements([req.strip() for req in requirements_file])
+
+    if subcommand in workflows:
+        msg.info(f"Running workflow '{subcommand}'")
+        for cmd in workflows[subcommand]:
+            project_run(
+                project_dir,
+                cmd,
+                overrides=overrides,
+                force=force,
+                dry=dry,
+                capture=capture,
+                skip_requirements_check=True,
+            )
+    else:
+        cmd = commands[subcommand]
+        for dep in cmd.get("deps", []):
+            if not (project_dir / dep).exists():
+                err = f"Missing dependency specified by command '{subcommand}': {dep}"
+                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
+                err_exits = 1 if not dry else None
+                msg.fail(err, err_help, exits=err_exits)
+        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+        with working_dir(project_dir) as current_dir:
+            msg.divider(subcommand)
+            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
+            if not rerun and not force:
+                msg.info(f"Skipping '{cmd['name']}': nothing changed")
+            else:
+                run_commands(cmd["script"], dry=dry, capture=capture)
+                if not dry:
+                    update_lockfile(current_dir, cmd)
+
+
+def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
+    """Simulate a CLI help prompt using the info available in the project.yml.
+
+    project_dir (Path): The project directory.
+    subcommand (Optional[str]): The subcommand or None. If a subcommand is
+        provided, the subcommand help is shown. Otherwise, the top-level help
+        and a list of available commands is printed.
+    """
+    config = load_project_config(project_dir)
+    config_commands = config.get("commands", [])
+    commands = {cmd["name"]: cmd for cmd in config_commands}
+    workflows = config.get("workflows", {})
+    project_loc = "" if is_cwd(project_dir) else project_dir
+    if subcommand:
+        validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
+        if subcommand in commands:
+            help_text = commands[subcommand].get("help")
+            if help_text:
+                print(f"\n{help_text}\n")
+        elif subcommand in workflows:
+            steps = workflows[subcommand]
+            print(f"\nWorkflow consisting of {len(steps)} commands:")
+            steps_data = [
+                (f"{i + 1}. {step}", commands[step].get("help", ""))
+                for i, step in enumerate(steps)
+            ]
+            msg.table(steps_data)
+            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
+            print(f"For command details, run: {help_cmd}")
+    else:
+        print("")
+        title = config.get("title")
+        if title:
+            print(f"{locale_escape(title)}\n")
+        if config_commands:
+            print(f"Available commands in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
+            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
+        if workflows:
+            print(f"Available workflows in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
+            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
+
+
+def run_commands(
+    commands: Iterable[str] = SimpleFrozenList(),
+    silent: bool = False,
+    dry: bool = False,
+    capture: bool = False,
+) -> None:
+    """Run a sequence of commands in a subprocess, in order.
+
+    commands (List[str]): The string commands.
+    silent (bool): Don't print the commands.
+    dry (bool): Perform a dry run and don't execut anything.
+    capture (bool): Whether to capture the output and errors of individual commands.
+        If False, the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the return code. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
+    """
+    for c in commands:
+        command = split_command(c)
+        # Not sure if this is needed or a good idea. Motivation: users may often
+        # use commands in their config that reference "python" and we want to
+        # make sure that it's always executing the same Python that spaCy is
+        # executed with and the pip in the same env, not some other Python/pip.
+        # Also ensures cross-compatibility if user 1 writes "python3" (because
+        # that's how it's set up on their system), and user 2 without the
+        # shortcut tries to re-run the command.
+        if len(command) and command[0] in ("python", "python3"):
+            command[0] = sys.executable
+        elif len(command) and command[0] in ("pip", "pip3"):
+            command = [sys.executable, "-m", "pip", *command[1:]]
+        if not silent:
+            print(f"Running command: {join_command(command)}")
+        if not dry:
+            run_command(command, capture=capture)
+
+
+def validate_subcommand(
+    commands: Sequence[str], workflows: Sequence[str], subcommand: str
+) -> None:
+    """Check that a subcommand is valid and defined. Raises an error otherwise.
+
+    commands (Sequence[str]): The available commands.
+    subcommand (str): The subcommand.
+    """
+    if not commands and not workflows:
+        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
+    if subcommand not in commands and subcommand not in workflows:
+        help_msg = []
+        if subcommand in ["assets", "asset"]:
+            help_msg.append("Did you mean to run: python -m spacy project assets?")
+        if commands:
+            help_msg.append(f"Available commands: {', '.join(commands)}")
+        if workflows:
+            help_msg.append(f"Available workflows: {', '.join(workflows)}")
+        msg.fail(
+            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
+            ". ".join(help_msg),
+            exits=1,
+        )
+
+
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    *,
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
+    """Check if a command should be rerun because its settings or inputs/outputs
+    changed.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    strict_version (bool):
+    RETURNS (bool): Whether to re-run the command.
+    """
+    # Always rerun if no-skip is set
+    if command.get("no_skip", False):
+        return True
+    lock_path = project_dir / PROJECT_LOCK
+    if not lock_path.exists():  # We don't have a lockfile, run command
+        return True
+    data = srsly.read_yaml(lock_path)
+    if command["name"] not in data:  # We don't have info about this command
+        return True
+    entry = data[command["name"]]
+    # Always run commands with no outputs (otherwise they'd always be skipped)
+    if not entry.get("outs", []):
+        return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
+    # If the entry in the lockfile matches the lockfile entry that would be
+    # generated from the current command, we don't rerun because it means that
+    # all inputs/outputs, hashes and scripts are the same and nothing changed
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
+
+
+def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
+    """Update the lockfile after running a command. Will create a lockfile if
+    it doesn't yet exist and will add an entry for the current command, its
+    script and dependencies/outputs.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    if not lock_path.exists():
+        srsly.write_yaml(lock_path, {})
+        data = {}
+    else:
+        data = srsly.read_yaml(lock_path)
+    data[command["name"]] = get_lock_entry(project_dir, command)
+    srsly.write_yaml(lock_path, data)
+
+
+def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
+    """Get a lockfile entry for a given command. An entry includes the command,
+    the script (command steps) and a list of dependencies and outputs with
+    their paths and file hashes, if available. The format is based on the
+    dvc.lock files, to keep things consistent.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    RETURNS (Dict[str, Any]): The lockfile entry.
+    """
+    deps = get_fileinfo(project_dir, command.get("deps", []))
+    outs = get_fileinfo(project_dir, command.get("outputs", []))
+    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
+    return {
+        "cmd": f"{COMMAND} run {command['name']}",
+        "script": command["script"],
+        "deps": deps,
+        "outs": [*outs, *outs_nc],
+        "spacy_version": about.__version__,
+        "spacy_git_version": GIT_VERSION,
+    }
+
+
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
+    """Generate the file information for a list of paths (dependencies, outputs).
+    Includes the file path and the file's checksum.
+
+    project_dir (Path): The current project directory.
+    paths (List[str]): The file paths.
+    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
+    """
+    data = []
+    for path in paths:
+        file_path = project_dir / path
+        md5 = get_checksum(file_path) if file_path.exists() else None
+        data.append({"path": path, "md5": md5})
+    return data
+
+
+def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
+    """Checks whether requirements are installed and free of version conflicts.
+    requirements (List[str]): List of requirements.
+    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
+        exist.
+    """
+    import pkg_resources
+
+    failed_pkgs_msgs: List[str] = []
+    conflicting_pkgs_msgs: List[str] = []
+
+    for req in requirements:
+        try:
+            pkg_resources.require(req)
+        except pkg_resources.DistributionNotFound as dnf:
+            failed_pkgs_msgs.append(dnf.report())
+        except pkg_resources.VersionConflict as vc:
+            conflicting_pkgs_msgs.append(vc.report())
+        except Exception:
+            msg.warn(
+                f"Unable to check requirement: {req} "
+                "Checks are currently limited to requirement specifiers "
+                "(PEP 508)"
+            )
+
+    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
+        msg.warn(
+            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
+            "correctly and you installed all requirements specified in your project's requirements.txt: "
+        )
+        for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
+            msg.text(pgk_msg)
+
+    return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 40b9986e85b..b7f689bcb3e 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,3 +1,4 @@
+import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union
 
diff --git a/spacy/errors.py b/spacy/errors.py
index fe067f7915d..4909371d549 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,5 +1,5 @@
-from typing import Literal
 import warnings
+from typing import Literal
 
 
 class ErrorsWithCodes(type):
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 7155c15df9a..2aa084ef52a 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,6 +1,5 @@
-from .candidate import Candidate, get_candidates, get_candidates_batch
+from .candidate import Candidate, InMemoryCandidate
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, InMemoryCandidate
 
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index f21f423e496..4419ed47666 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,6 +1,8 @@
 from libcpp.vector cimport vector
-from .kb_in_memory cimport InMemoryLookupKB
+
 from ..typedefs cimport hash_t
+from .kb_in_memory cimport InMemoryLookupKB
+
 
 cdef class Candidate:
     pass
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index bf66ccfae67..1739cfa64f6 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,6 +1,7 @@
 # cython: infer_types=True
 
 from .kb_in_memory cimport InMemoryLookupKB
+
 from ..errors import Errors
 
 
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index bb58bf88a46..c3479eabc18 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
 
-from .candidate import Candidate
+from ..errors import Errors
 from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from .candidate import Candidate
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 3aab0d73e72..fee407e68b2 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,5 +1,5 @@
-# cython: infer_types=True
-from typing import Any, Callable, Dict, Iterable
+# cython: infer_types=True, profile=True
+from typing import Any, Callable, Dict, Iterable, Union
 
 import srsly
 
@@ -22,6 +22,7 @@ from ..util import SimpleFrozenList, ensure_path
 
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
+
 from .candidate import InMemoryCandidate
 
 
diff --git a/spacy/language.py b/spacy/language.py
index 028f733200e..ea641224684 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,12 +1,4 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
 import functools
-import inspect
 import itertools
 import multiprocessing as mp
 import random
@@ -25,6 +17,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     NoReturn,
     Optional,
     Pattern,
@@ -37,29 +30,41 @@
     overload,
 )
 
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples, validate_distillation_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from . import util
-from . import about
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
 from .lookups import load_lookups
-
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+    ConfigSchema,
+    ConfigSchemaInit,
+    ConfigSchemaNlp,
+    ConfigSchemaPretrain,
+    validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_distillation_examples, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+    _DEFAULT_EMPTY_PIPES,
+    CONFIG_SECTION_ORDER,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    _pipe,
+    combine_score_weights,
+    raise_error,
+    registry,
+    warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 2d14edcd6b0..ff51d77e8a9 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,10 +1,19 @@
 from numpy cimport ndarray
 
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
+from .attrs cimport (
+    ID,
+    LANG,
+    LENGTH,
+    LOWER,
+    NORM,
+    ORTH,
+    PREFIX,
+    SHAPE,
+    SUFFIX,
+    attr_id_t,
+)
 from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index b0c3784d86e..7e9fd0b37e5 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -2,6 +2,7 @@
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
+from cython.view cimport array as cvarray
 from libc.string cimport memset
 
 np.import_array()
@@ -35,7 +36,7 @@ from .typedefs cimport attr_t, flags_t
 from .attrs import intify_attrs
 from .errors import Errors, Warnings
 
-OOV_RANK = 0xffffffffffffffff  # UINT64_MAX
+OOV_RANK = 0xffffffffffffffff # UINT64_MAX
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.id = OOV_RANK
 
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 0b639ab04fb..60299603623 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True
+# cython: infer_types=True, profile=True
 import warnings
 from collections import defaultdict
 from itertools import product
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index a0b6d91e7d5..fe2d8bec3bc 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,6 +1,17 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
-from typing import Iterator, Iterable, overload
-from ..vocab import Vocab
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 7e734ac247e..8accd8c4465 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True
+# cython: binding=True, infer_types=True, profile=True
 from typing import Iterable, List
 
 from cymem.cymem cimport Pool
@@ -12,23 +12,35 @@ import warnings
 
 import srsly
 
-from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
+from ..attrs cimport (
+    DEP,
+    ENT_IOB,
+    ID,
+    LEMMA,
+    MORPH,
+    NULL_ATTR,
+    ORTH,
+    POS,
+    TAG,
+    attr_id_t,
+)
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
+from ..vocab cimport Vocab
 
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings cimport get_string_id
-from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
-from ..strings import get_string_id
 from .levenshtein import levenshtein_compare
 
+from ..strings cimport get_string_id
+
+from ..attrs import IDS
+from ..util import registry
+
 DEF PADDING = 5
 
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 45685db228a..d3c679a65d5 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,7 +1,5 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
-from typing import overload
-from .matcher import Matcher
-from ..vocab import Vocab
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 6e3c52924fa..107d7d926ee 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,15 +1,17 @@
 # cython: infer_types=True, profile=True
-from typing import List
 from collections import defaultdict
+from typing import List
+
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings
 
-from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
 
 from ..attrs import IDS
 
+from ..structs cimport TokenC
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index db960fbd0a9..987eb6733d3 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -14,21 +14,9 @@
 )
 from thinc.types import Floats2d
 
-from ...util import registry
-from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate
-from ...vocab import Vocab
-from ...tokens import Doc, Span, SpanGroup
-from ..extract_spans import extract_spans
 from ...errors import Errors
-from ...kb import (
-    Candidate,
-    InMemoryLookupKB,
-    KnowledgeBase,
-    get_candidates,
-    get_candidates_batch,
-)
-from ...tokens import Doc, Span
+from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
+from ...tokens import Doc, Span, SpanGroup
 from ...util import registry
 from ...vocab import Vocab
 from ..extract_spans import extract_spans
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 01312983d86..422abf4e260 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,13 @@
-from typing import Optional, List, Tuple, Any, Literal
-from thinc.types import Floats2d
-from thinc.api import Model
 import warnings
+from typing import Any, List, Literal, Optional, Tuple
+
+from thinc.api import Model
+from thinc.types import Floats2d
 
 from ...errors import Errors, Warnings
+from ...tokens.doc import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
-from ...tokens.doc import Doc
 
 TransitionSystem = Any  # TODO
 State = Any  # TODO
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index a605d32cd40..61bc7291e2e 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,8 +22,6 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 1a1b0a0fffd..3b9a9ce2dd1 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,4 +1,3 @@
-import warnings
 from typing import Callable, List, Optional, Sequence, Tuple, cast
 
 from thinc.api import Model, Ops, registry
@@ -6,10 +5,9 @@
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from thinc.util import partial
 
-from ..attrs import ORTH
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..tokens import Doc
-from ..vectors import Mode, Vectors
+from ..vectors import Mode
 from ..vocab import Vocab
 
 
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 9b2114900d3..fd0af12ceab 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,28 +1,45 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import List, Tuple, Any, Optional, TypeVar, cast
-from libc.string cimport memset, memcpy
+from typing import Any, List, Optional, Tuple, TypeVar, cast
+
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
+
 import numpy
+
 cimport numpy as np
-from thinc.api import Model, normal_init, chain, list2array, Linear
-from thinc.api import uniform_init, glorot_uniform_init, zero_init
-from thinc.api import NumpyOps
+
+from thinc.api import (
+    Linear,
+    Model,
+    NumpyOps,
+    chain,
+    glorot_uniform_init,
+    list2array,
+    normal_init,
+    uniform_init,
+    zero_init,
+)
+
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
-from thinc.types import Ints1d, Ints2d
+
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
 from ..pipeline._parser_internals.batch import GreedyBatch
+
 from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
-from ..pipeline._parser_internals.transition_system cimport TransitionSystem
 from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..pipeline._parser_internals.transition_system cimport (
+    TransitionSystem,
+    c_apply_actions,
+    c_transition_batch,
+)
+
 from ..tokens.doc import Doc
 from ..util import registry
 
-
 State = Any  # TODO
 
 
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 494088879b1..5138d353cf0 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 7ee621056f1..d75c1071941 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,14 +1,14 @@
 # cython: infer_types
-# cython: profile=False
 import warnings
-from typing import Union, Tuple, List, Dict, Optional
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy
+
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
-from .errors import Warnings
 from . import symbols
 from .errors import Warnings
-from .parts_of_speech import IDS as POS_IDS
 
 
 cdef class Morphology:
diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py
index 89f2861ceac..1e307b66cb9 100644
--- a/spacy/pipeline/_edit_tree_internals/schemas.py
+++ b/spacy/pipeline/_edit_tree_internals/schemas.py
@@ -1,12 +1,8 @@
 from collections import defaultdict
 from typing import Any, Dict, List, Union
 
-try:
-    from pydantic.v1 import BaseModel, Field, ValidationError
-    from pydantic.v1.types import StrictBool, StrictInt, StrictStr
-except ImportError:
-    from pydantic import BaseModel, Field, ValidationError  # type: ignore
-    from pydantic.types import StrictBool, StrictInt, StrictStr  # type: ignore
+from pydantic import BaseModel, Field, ValidationError
+from pydantic.types import StrictBool, StrictInt, StrictStr
 
 
 class MatchNodeSchema(BaseModel):
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 571f246b1e3..5a452e56a88 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,5 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
+
 # These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index c86de231d09..7098b822ef0 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,14 +1,21 @@
 # cython: infer_types=True
+# cython: profile=True
+cimport numpy as np
+
 import numpy
-from cpython.ref cimport PyObject, Py_XDECREF
 
-from ...typedefs cimport class_t
+from cpython.ref cimport Py_XDECREF, PyObject
+
+from ...typedefs cimport class_t, hash_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+
 from .batch cimport Batch
 from .search cimport Beam, MaxViolation
+
 from .search import MaxViolation
+
 from .stateclass cimport StateC, StateClass
 
 
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1b6b25e5f16..1c61ac271d8 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,5 +1,4 @@
 cimport libcpp
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -8,7 +7,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 673e36bf5ac..08f60b2634b 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -1,4 +1,4 @@
-# cython: cdivision=True, infer_types=True
+# cython: profile=True, cdivision=True, infer_types=True
 from cymem.cymem cimport Address, Pool
 from libc.stdint cimport int32_t
 from libcpp.vector cimport vector
@@ -9,7 +9,7 @@ from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...tokens.token cimport MISSING_DEP
-from ...typedefs cimport attr_t
+from ...typedefs cimport attr_t, hash_t
 
 from ...training import split_bilu_label
 
@@ -18,6 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
+
 from .search cimport Beam
 
 
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index cf19c834ed9..5c31ff5c21d 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,10 +1,10 @@
 import os
 import random
+
+from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 
 from collections import Counter
 
@@ -14,16 +14,15 @@ from ...tokens.span import Span
 
 from ...attrs cimport IS_SPACE
 from ...lexeme cimport Lexeme
-from ...structs cimport SpanC
+from ...structs cimport SpanC, TokenC
 from ...tokens.span cimport Span
 from ...typedefs cimport attr_t, weight_t
 
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
-from .search cimport Beam
-from .stateclass cimport StateClass
 from ._state cimport StateC
+from .search cimport Beam
 from .stateclass cimport StateClass
 from .transition_system cimport Transition, do_func_t
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index dfe30e1c130..4626496335a 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -1,12 +1,10 @@
 from cymem.cymem cimport Pool
-
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.pair cimport pair
 from libcpp.queue cimport priority_queue
 from libcpp.vector cimport vector
 
-from ...typedefs cimport class_t, weight_t, hash_t
+from ...typedefs cimport class_t, hash_t, weight_t
 
 ctypedef pair[weight_t, size_t] Entry
 ctypedef priority_queue[Entry] Queue
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 1d9b6dd7adf..251eaa805cb 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,7 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.string cimport memset, memcpy
-from libc.math cimport log, exp
+from libc.math cimport exp, log
+from libc.string cimport memcpy, memset
+
 import math
 
 from cymem.cymem cimport Pool
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index e49ff63c48b..c2a0d22956a 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True
-# cython: profile=False
+import numpy
+
 from libcpp.vector cimport vector
 
 from ...tokens.doc cimport Doc
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index d1340d68c62..a433ce7dc75 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -11,9 +11,11 @@ from collections import Counter
 import srsly
 
 from ...structs cimport TokenC
+from ...tokens.doc cimport Doc
 from ...typedefs cimport attr_t, weight_t
-from .stateclass cimport StateClass
+from . cimport _beam_utils
 from ._parser_utils cimport arg_max_if_valid
+from .stateclass cimport StateClass
 
 from ... import util
 from ...errors import Errors
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 126a48945bc..76f82b84e38 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -11,7 +11,7 @@
 from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
-from ..vocab import Vocab
+from ..training import Example
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py
index 370a698c25a..b4961487b83 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.py
@@ -1,23 +1,19 @@
 # cython: infer_types=True, binding=True
 from collections import defaultdict
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.arc_eager import ArcEager
-
-from ._parser_internals.arc_eager cimport ArcEager
-from .transition_parser cimport Parser
-
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
 from ..util import registry
 from ._parser_internals import nonproj
+from ._parser_internals.arc_eager import ArcEager
 from ._parser_internals.nonproj import DELIMITER
+from ._parser_internals.transition_system import TransitionSystem
 from .functions import merge_subtokens
+from .transition_parser import Parser
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index a1bcb98455c..046ef19c3d5 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,12 +1,12 @@
 from collections import Counter
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model
-from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -19,10 +19,6 @@
 from .lemmatizer import lemmatizer_score
 from .trainable_pipe import TrainablePipe
 
-# The cutoff value of *top_k* above which an alternative method is used to process guesses.
-TOP_K_GUARDRAIL = 20
-
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index a1007abbffe..629a5f193aa 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,44 +1,26 @@
-import warnings
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
-from numpy import dtype
-from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
-from pathlib import Path
-from itertools import islice
-import srsly
 import random
+import warnings
 from itertools import islice
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union, cast
 
 import srsly
+from numpy import dtype
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats2d
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 
-from ..kb import KnowledgeBase, Candidate
-from ..tokens import Doc, Span
-from ..ml import empty_kb
-from ..tokens import Doc, Span, SpanGroup
-from .pipe import deserialize_config
-from .trainable_pipe import TrainablePipe
-from ..language import Language
-from ..vocab import Vocab
-from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors, Warnings
-from ..util import SimpleFrozenList, registry
 from .. import util
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
 from ..scorer import Scorer
-from ..tokens import Doc, Span
+from ..tokens import Doc, Span, SpanGroup
 from ..training import Example, validate_examples, validate_get_examples
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
-from .legacy.entity_linker import EntityLinker_v1
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
-
 ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 5e7d0720a40..7259fc02699 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,32 +1,30 @@
 # cython: infer_types=True, profile=True, binding=True
+from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
+
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
-from itertools import islice
-from typing import Callable, Dict, Optional, Union
-
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
-from ..parts_of_speech import IDS as POS_IDS
-from ..symbols import POS
-from ..language import Language
-from ..errors import Errors
-from .pipe import deserialize_config
-from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..parts_of_speech import IDS as POS_IDS
 from ..scorer import Scorer
+from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 2c5fd89cc5d..1c7cf151385 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -1,25 +1,16 @@
 # cython: infer_types=True, binding=True
 from collections import defaultdict
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.ner import BiluoPushDown
-from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
-from ..util import registry
-from ..training import remove_bilu_prefix
-
-from ._parser_internals.ner cimport BiluoPushDown
-from .transition_parser cimport Parser
-
 from ..language import Language
-from ..scorer import get_ner_prf
-from ..training import remove_bilu_prefix
+from ..scorer import PRFScore, get_ner_prf
+from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
+from ._parser_internals.ner import BiluoPushDown
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index af7cd09f171..7bc6735a802 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,6 +1,6 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
 import warnings
-from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
 
 import srsly
 
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 02b92e87812..6dd62ed8577 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
 from typing import Callable, List, Optional
 
 import srsly
@@ -7,9 +7,11 @@ from ..tokens.doc cimport Doc
 
 from .. import util
 from ..language import Language
+from ..scorer import Scorer
 from .pipe import Pipe
 from .senter import senter_score
 
+
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index ba45df28400..42615e194e0 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,25 +1,21 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Union
 
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .tagger import ActivationsT, Tagger
-from ..language import Language
+from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
-
+from .tagger import ActivationsT, Tagger
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 4875c5e4bff..cd8fea36b47 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,20 +17,12 @@
 
 import srsly
 
-from .pipe import Pipe
-from ..training import Example
-from ..language import Language
-from ..errors import Errors, Warnings
-from ..util import ensure_path, SimpleFrozenList, registry
-from ..tokens import Doc, Span
-from ..scorer import Scorer, get_ner_prf
-from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
 from ..matcher import Matcher, PhraseMatcher
 from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import Scorer
+from ..scorer import Scorer, get_ner_prf
 from ..tokens import Doc, Span
 from ..training import Example
 from ..util import SimpleFrozenList, ensure_path, registry
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 5c450f36a33..72fd78f461e 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,8 +1,18 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union, Protocol, runtime_checkable
-from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
-from thinc.api import Optimizer
-from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
+from dataclasses import dataclass
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    cast,
+    runtime_checkable,
+)
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 8740058174a..f3d0527ea0b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,29 +1,29 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Callable, Dict, Iterable, List, Optional, Union
-from typing import Tuple
-import numpy
-import srsly
-from thinc.api import Model, set_dropout_rate, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
+import srsly
+from thinc.api import Config, Model, set_dropout_rate
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
+from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
+from ..vocab cimport Vocab
 
 from .. import util
-from ..errors import Errors
+from ..attrs import ID, POS
+from ..errors import Errors, Warnings
 from ..language import Language
+from ..parts_of_speech import X
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
+from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 default_model_config = """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 6cb33109891..13841dd7bbb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,9 +1,5 @@
-from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
-from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
-from thinc.types import Floats2d
-import numpy
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
 from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 9ed9770086c..309b9a84443 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,9 +1,5 @@
-from typing import Iterable, Optional, Dict, List, Callable, Any, Union
-from thinc.types import Floats2d
-from thinc.api import Model, Config
-
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 from thinc.api import Config, Model
 from thinc.types import Floats2d
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index f168aee2ec4..92aec22b7a7 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,10 +1,8 @@
-from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
-from thinc.api import Model, set_dropout_rate, Optimizer, Config
-from thinc.types import Floats2d
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
 
 from thinc.api import Config, Model, Optimizer, set_dropout_rate
+from thinc.types import Floats2d
 
 from ..errors import Errors
 from ..language import Language
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 97442a1aa97..e7cf566a113 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,19 +1,16 @@
-# cython: infer_types=True, binding=True
+# cython: infer_types=True, profile=True, binding=True
+import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import set_dropout_rate, Model, Optimizer
-import warnings
+from thinc.api import Model, Optimizer, set_dropout_rate
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples, validate_distillation_examples
-from ..errors import Errors, Warnings
-from .pipe import Pipe, deserialize_config
 from .. import util
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..language import Language
-from ..training import Example, validate_examples
+from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
 from .pipe import Pipe, deserialize_config
 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index ef2e3314e85..d521aeced7f 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,49 +1,61 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
 from cymem.cymem cimport Pool
 
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memset
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
 
-import random
 import contextlib
+import random
+import warnings
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops, get_array_module
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
-from thinc.api import CupyOps, NumpyOps, set_dropout_rate
+from thinc.api import (
+    CupyOps,
+    NumpyOps,
+    Optimizer,
+    chain,
+    get_array_module,
+    get_ops,
+    set_dropout_rate,
+    softmax_activation,
+    use_ops,
+)
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
-from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.search cimport Beam
+
 from ..tokens.doc cimport Doc
-from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateC, StateClass
+from .trainable_pipe cimport TrainablePipe
+
 from ._parser_internals import _beam_utils
+
+from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from ..typedefs cimport weight_t
 
-from ..training import validate_examples, validate_get_examples
-from ..training import validate_distillation_examples
-from ..errors import Errors, Warnings
 from .. import util
-from ..errors import Errors
-from ..training import validate_examples, validate_get_examples
-from ._parser_internals import _beam_utils
+from ..errors import Errors, Warnings
+from ..training import (
+    validate_distillation_examples,
+    validate_examples,
+    validate_get_examples,
+)
+
 
 # TODO: Remove when we switch to Cython 3.
 cdef extern from "<algorithm>" namespace "std" nogil:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 7fc5ec20e51..4372e3f5e2e 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,12 +1,3 @@
-from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
-from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
-from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
-from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError, Model
-from thinc.config import Promise
-from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
@@ -18,6 +9,7 @@
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Type,
@@ -25,34 +17,19 @@
     Union,
 )
 
-try:
-    from pydantic.v1 import (
-        BaseModel,
-        ConstrainedStr,
-        Field,
-        StrictBool,
-        StrictFloat,
-        StrictInt,
-        StrictStr,
-        ValidationError,
-        create_model,
-        validator,
-    )
-    from pydantic.v1.main import ModelMetaclass
-except ImportError:
-    from pydantic import (  # type: ignore
-        BaseModel,
-        ConstrainedStr,
-        Field,
-        StrictBool,
-        StrictFloat,
-        StrictInt,
-        StrictStr,
-        ValidationError,
-        create_model,
-        validator,
-    )
-    from pydantic.main import ModelMetaclass  # type: ignore
+from pydantic import (
+    BaseModel,
+    ConstrainedStr,
+    Field,
+    StrictBool,
+    StrictFloat,
+    StrictInt,
+    StrictStr,
+    ValidationError,
+    create_model,
+    validator,
+)
+from pydantic.main import ModelMetaclass
 from thinc.api import ConfigValidationError, Model, Optimizer
 from thinc.config import Promise
 
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index b734a707c54..c05731c9a15 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,8 +1,5 @@
-from libc.stdint cimport int64_t, uint32_t
-from libcpp.vector cimport vector
-from libcpp.set cimport set
 from cymem.cymem cimport Pool
-from libc.stdint cimport int64_t
+from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
 from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 393661f591d..98224fcd449 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,6 +1,5 @@
-from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
-from typing import Any, Iterable, Iterator, Optional, Union, overload
+from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union, overload
 
 class StringStore:
     def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 73e4c46ed46..43826f07c44 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,10 @@
 # cython: infer_types=True
-from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
+from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+
 cimport cython
 from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 import srsly
@@ -14,7 +17,6 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-
 cdef class StringStore:
     """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index fdc9f192c2f..28551f9ee63 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,11 +1,11 @@
-import pytest
-from spacy.util import get_lang_class
 import functools
-from hypothesis import settings
-import inspect
 import importlib
+import inspect
 import sys
 
+import pytest
+from hypothesis import settings
+
 from spacy.util import get_lang_class
 
 # Functionally disable deadline settings for tests
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 0b05ca7c123..cf850a2234d 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,7 +6,6 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index ca5c2ad3959..3ab7de76323 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,7 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+
 # Helper functions
 def _get_tuple(s: Span):
     return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 23fc8164412..0983159b75d 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -1,11 +1,14 @@
 # cython: infer_types=True, binding=True
+from cymem.cymem cimport Pool
+
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
 from spacy.typedefs cimport class_t, weight_t
-from cymem.cymem cimport Pool
 
-from ..conftest import cytest
 import pytest
 
+from ..conftest import cytest
+
+
 cdef struct TestState:
     int length
     int x
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 2c520b7daf6..f0efc3a63fd 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -3,6 +3,7 @@
 
 import pytest
 from numpy.testing import assert_equal
+from thinc.api import fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import ENT_IOB
@@ -16,8 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-from thinc.api import fix_random_seed
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 4c709932bb1..636bb887789 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,17 +1,19 @@
 import itertools
-import pytest
+
 import numpy
+import pytest
 from numpy.testing import assert_equal
-from thinc.api import Adam
+from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.training import Example
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
+from spacy.training import Example
 from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 0f204ead477..7465c844492 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import cast
 import pickle
+from typing import cast
 
 import hypothesis.strategies as st
 import pytest
@@ -10,7 +10,6 @@
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
 from spacy.pipeline.trainable_pipe import TrainablePipe
-from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 7b597424a34..e44fef2ad25 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Dict, Any, cast
+from typing import Any, Callable, Dict, Iterable, cast
 
 import pytest
 from numpy.testing import assert_equal
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 6bff3288dc3..520012c5075 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,16 +2,10 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
-from spacy.tokens import Doc, Span
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.pipeline import EntityRecognizer, merge_entities
-from spacy.pipeline import SpanRuler
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import EntityRecognizer, EntityRuler, SpanRuler, merge_entities
+from spacy.pipeline import EntityRecognizer, SpanRuler, merge_entities
 from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc, Span
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 9854b391e60..6dd4114f1cd 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -1,10 +1,5 @@
 import pytest
-
-try:
-    from pydantic.v1 import StrictBool
-except ImportError:
-    from pydantic import StrictBool  # type: ignore
-
+from pydantic import StrictBool
 from thinc.api import ConfigValidationError
 
 from spacy.lang.en import English
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index fffb7b4ed7f..542d14d1516 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -9,7 +10,7 @@
 from spacy.language import Language
 from spacy.morphology import Morphology
 from spacy.pipeline import TrainablePipe
-from spacy.attrs import MORPH
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc
 from spacy.training import Example
 
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index c45dccb0624..9e1382ebd8c 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -1,4 +1,6 @@
 import pytest
+from pydantic import StrictInt, StrictStr
+from thinc.api import ConfigValidationError, Linear, Model
 
 try:
     from pydantic.v1 import StrictInt, StrictStr
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 94285178310..51f943898f1 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_equal
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 5dcc2e70f67..42eb90a1bb1 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,6 +1,7 @@
 import numpy
-from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged, fix_random_seed
+import pytest
+from numpy.testing import assert_almost_equal, assert_array_equal
+from thinc.api import NumpyOps, Ragged, fix_random_seed, get_current_ops
 
 from spacy import util
 from spacy.lang.en import English
@@ -8,7 +9,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import registry, make_tempdir
+from spacy.util import make_tempdir, registry
 
 OPS = get_current_ops()
 
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index b6f94f7f97b..05e814f0733 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -8,7 +9,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from thinc.api import compounding
+from spacy.training import Example
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2383c36bb01..3f2d757eebc 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,5 +1,5 @@
-from typing import cast
 import random
+from typing import cast
 
 import numpy.random
 import pytest
@@ -13,12 +13,16 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import single_label_bow_config
-from spacy.pipeline.textcat import single_label_cnn_config
-from spacy.pipeline.textcat import single_label_default_config
-from spacy.pipeline.textcat_multilabel import multi_label_bow_config
-from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
-from spacy.pipeline.textcat_multilabel import multi_label_default_config
+from spacy.pipeline.textcat import (
+    single_label_bow_config,
+    single_label_cnn_config,
+    single_label_default_config,
+)
+from spacy.pipeline.textcat_multilabel import (
+    multi_label_bow_config,
+    multi_label_cnn_config,
+    multi_label_default_config,
+)
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index eb0dcc1e38c..646ce0f5d48 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,14 +5,25 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
-from spacy.language import Language
-from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
-from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.language import (
+    DEFAULT_CONFIG,
+    DEFAULT_CONFIG_DISTILL_PATH,
+    DEFAULT_CONFIG_PRETRAIN_PATH,
+    Language,
+)
+from spacy.ml.models import (
+    MaxoutWindowEncoder,
+    MultiHashEmbed,
+    build_tb_parser_model,
+    build_Tok2Vec_model,
+)
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
-from spacy.util import load_config, load_config_from_str
-from spacy.util import load_model_from_config, registry
+from spacy.util import (
+    load_config,
+    load_config_from_str,
+    load_model_from_config,
+    registry,
+)
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 39fbbf58217..d5f2f13af4f 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,9 +8,14 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import DependencyParser, EntityRecognizer
-from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
-from spacy.pipeline import TrainablePipe
+from spacy.pipeline import (
+    DependencyParser,
+    EntityRecognizer,
+    SentenceRecognizer,
+    Tagger,
+    TextCategorizer,
+    TrainablePipe,
+)
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 7b729d78f21..a47f03e8ab4 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,19 +1,31 @@
 import math
 import os
+import time
 from collections import Counter
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 
+import numpy
 import pytest
 import srsly
 from click import NoSuchOption
 from packaging.specifiers import SpecifierSet
-from thinc.api import Config
+from thinc.api import Config, ConfigValidationError
 
 import spacy
 from spacy import about
-from spacy.cli import download_module, info
-from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
+from spacy.cli import info
+from spacy.cli._util import (
+    download_file,
+    is_subpath_of,
+    load_project_config,
+    parse_config_overrides,
+    string_to_list,
+    substitute_project_variables,
+    upload_file,
+    validate_project_commands,
+    walk_directory,
+)
 from spacy.cli.apply import apply
 from spacy.cli.debug_data import (
     _compile_gold,
@@ -31,6 +43,8 @@
 from spacy.cli.init_config import RECOMMENDATIONS, fill_config, init_config
 from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import _is_permitted_package_name, get_third_party_dependencies
+from spacy.cli.project.remote_storage import RemoteStorage
+from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 1789d60ea4c..32ca639b37d 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -7,7 +7,7 @@
 from typer.testing import CliRunner
 
 from spacy.cli._util import app, get_git_version
-from spacy.tokens import Doc, DocBin, Span
+from spacy.tokens import Doc, DocBin
 
 from .util import make_tempdir, normalize_whitespace
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index e4b06893c93..25352d2bb16 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -4,7 +4,7 @@
 from unittest import mock
 
 import pytest
-from thinc.api import CupyOps, NumpyOps, get_current_ops
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 import spacy
 from spacy.lang.de import German
@@ -13,12 +13,14 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.lang.en import English
-from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error, find_matching_language
-from spacy.util import load_model_from_config
-import spacy
-from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
+from spacy.util import (
+    find_matching_language,
+    ignore_error,
+    load_model_from_config,
+    raise_error,
+    registry,
+)
+from spacy.vocab import Vocab
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 160e2ecf335..c05ef625e11 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,19 @@
 import ctypes
 import os
 from pathlib import Path
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
-from spacy.util import find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+
+import pytest
+from pydantic import ValidationError
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
index fb034accac2..2c2fcef755e 100644
--- a/spacy/tests/test_symbols.py
+++ b/spacy/tests/test_symbols.py
@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.symbols import IDS, NAMES
 
 V3_SYMBOLS = {
diff --git a/spacy/tests/training/test_loop.py b/spacy/tests/training/test_loop.py
index 46d01509504..9140421b46b 100644
--- a/spacy/tests/training/test_loop.py
+++ b/spacy/tests/training/test_loop.py
@@ -1,11 +1,13 @@
 from typing import Callable, Iterable, Iterator
+
 import pytest
+from thinc.api import Config
+
 from spacy import Language
 from spacy.training import Example
 from spacy.training.initialize import init_nlp_student
 from spacy.training.loop import distill, train
 from spacy.util import load_model_from_config, registry
-from thinc.api import Config
 
 
 @pytest.fixture
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index ef20ec365c6..e8a19947606 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,10 +8,17 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
-from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
-from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
-from spacy.training.alignment_array import AlignmentArray
+from spacy.training import (
+    Alignment,
+    Corpus,
+    Example,
+    biluo_tags_to_offsets,
+    biluo_tags_to_spans,
+    docs_to_json,
+    iob_to_biluo,
+    offsets_to_biluo_tags,
+    validate_distillation_examples,
+)
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 58d30c3202f..b2e50969462 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,12 +2,7 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
-from .typedefs cimport hash_t
-from .structs cimport LexemeC, SpanC, TokenC
-from .tokens.doc cimport Doc
-from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
-from .strings cimport StringStore
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc
 from .typedefs cimport hash_t
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 6b157d599f1..1fc5f310920 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,4 +1,4 @@
-# cython: embedsignature=True, binding=True
+# cython: embedsignature=True, profile=True, binding=True
 cimport cython
 from cymem.cymem cimport Pool
 from cython.operator cimport dereference as deref
@@ -9,17 +9,11 @@ from preshed.maps cimport PreshMap
 
 import re
 
-from .tokens.doc cimport Doc
-from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
-from .attrs import intify_attrs
-from .symbols import ORTH, NORM
-from .errors import Errors
 from . import util
-from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index e5a244360e3..7617e462fde 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -1,9 +1,9 @@
 from ._serialize import DocBin
 from .doc import Doc
+from .doc_bin import DocBin
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .doc_bin import DocBin
-from .morphanalysis import MorphAnalysis
+from .token import Token
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 2b39d5baa28..dc7c0143029 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,7 +8,6 @@ from typing import (
     List,
     Optional,
     Protocol,
-    Sequence,
     Tuple,
     Union,
     overload,
@@ -17,20 +16,15 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-from .span import Span
-from .token import Token
-from .span_groups import SpanGroups
-from .retokenizer import Retokenizer
+
 from ..lexeme import Lexeme
 from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from ._retokenize import Retokenizer
+from .retokenizer import Retokenizer
 from .span import Span
+from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore
 
-DOCBIN_ALL_ATTRS: Tuple[str, ...]
-
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3880920a8b9..ff1a0d310d1 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,15 +20,8 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-from .span_groups import SpanGroups
-from .token cimport Token
-from ..lexeme cimport Lexeme, EMPTY_LEXEME
-from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport attr_id_t
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
-from ._dict_proxies import SpanGroups
+from .span_groups import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -42,7 +35,6 @@ from ..attrs cimport (
     LENGTH,
     MORPH,
     NORM,
-    ORTH,
     POS,
     SENT_START,
     SPACY,
@@ -50,22 +42,17 @@ from ..attrs cimport (
     attr_id_t,
 )
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
-from ..typedefs cimport attr_t
+from ..typedefs cimport attr_t, flags_t
 from .token cimport Token
 
 from .. import parts_of_speech, schemas, util
 from ..attrs import IDS, intify_attr
-from ..compat import copy_reg
+from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
-from .. import util
-from .. import parts_of_speech
-from .. import schemas
-from .underscore import Underscore, get_ext_args
-from .retokenizer import Retokenizer
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
-from ._retokenize import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
+from .retokenizer import Retokenizer
 from .underscore import Underscore, get_ext_args
 
 DEF PADDING = 5
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 8a08864d46e..4dda40a05ee 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,7 +10,9 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import ensure_path, SimpleFrozenList
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab import Vocab
+from .doc import Doc
 from .span_groups import SpanGroups
 
 # fmt: off
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 22ce18181a7..7ded04500a3 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -1,10 +1,9 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
-# cython: profile=False
 from typing import Generator, List, Tuple
 
 cimport cython
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t
+from libc.stdint cimport int32_t, int64_t
 from libcpp.pair cimport pair
 from libcpp.unordered_map cimport unordered_map
 from libcpp.unordered_set cimport unordered_set
@@ -12,12 +11,13 @@ from libcpp.unordered_set cimport unordered_set
 import weakref
 
 from murmurhash.mrmr cimport hash64
+from preshed.maps cimport map_get_unless_missing
 
 from .. import Errors
 
-from ..typedefs cimport hash_t
 from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
+from ..typedefs cimport hash_t
 
 from .token import Token
 
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index f866488ecc2..73922c62b9b 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,9 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
-from ..morphology cimport MorphAnalysisC
 from libcpp.memory cimport shared_ptr
 
+from ..morphology cimport MorphAnalysisC
+from ..typedefs cimport hash_t
+from ..vocab cimport Vocab
+
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index ceaa3ecd04e..014c01a2f74 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,17 +1,15 @@
-# cython: profile=False
 cimport numpy as np
 from libc.string cimport memset
 
 from ..errors import Errors
 from ..morphology import Morphology
 
-from ..morphology cimport check_feature, get_by_field, list_features
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr
+
+from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
-from libcpp.memory cimport shared_ptr
-from cython.operator cimport dereference as deref
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index c0052ca9a9a..7b6501d4442 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -1,6 +1,7 @@
-# cython: infer_types=True, bounds_check=False
+# cython: infer_types=True, bounds_check=False, profile=True
 from cymem.cymem cimport Pool
-from libc.string cimport memset
+from libc.stdlib cimport free, malloc
+from libc.string cimport memcpy, memset
 
 import numpy
 from thinc.api import get_array_module
@@ -9,12 +10,15 @@ from ..attrs cimport MORPH, NORM
 from ..lexeme cimport EMPTY_LEXEME, Lexeme
 from ..structs cimport LexemeC, TokenC
 from ..vocab cimport Vocab
-from .doc cimport Doc, set_children_from_heads, token_by_start
+from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
 from .span cimport Span
 from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
+from ..util import SimpleFrozenDict
+from .underscore import is_writable_attr
+
 from ..strings cimport get_string_id
 
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index ce318ed0dfb..fb592e68bd8 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,5 +1,5 @@
-from libcpp.memory cimport shared_ptr
 cimport numpy as np
+from libcpp.memory cimport shared_ptr
 
 from ..structs cimport SpanC
 from ..typedefs cimport attr_t
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 34e734ba68f..0a4b964999d 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -3,17 +3,20 @@ cimport numpy as np
 from libc.math cimport sqrt
 from libcpp.memory cimport make_shared
 
+import copy
+import warnings
+
 import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport *
-from ..attrs cimport ORTH, attr_id_t
+from ..attrs cimport attr_id_t
 from ..lexeme cimport Lexeme
-from ..structs cimport TokenC
+from ..parts_of_speech cimport univ_pos_t
+from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr
-from .token cimport Token
+from ..typedefs cimport attr_t, flags_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
 
 from ..errors import Errors, Warnings
 from ..util import normalize_slice
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 8a524926a03..bc5bb92d38c 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -1,17 +1,16 @@
-# cython: profile=False
 import struct
 import weakref
 from copy import deepcopy
-from typing import Iterable, Optional, Union
+from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
 
 import srsly
 
 from spacy.errors import Errors
 
-from .span cimport Span
-from libc.stdint cimport uint64_t, uint32_t, int32_t
 from libcpp.memory cimport make_shared
 
+from .span cimport Span
+
 
 cdef class SpanGroup:
     """A group of spans that all belong to the same Doc object. The group
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a20b1193fab..234bbc1a789 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -2,11 +2,13 @@
 # cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
+from cython.view cimport array as cvarray
 
 np.import_array()
 
 import warnings
 
+import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport (
@@ -27,7 +29,6 @@ from ..attrs cimport (
     LIKE_EMAIL,
     LIKE_NUM,
     LIKE_URL,
-    ORTH,
 )
 from ..lexeme cimport Lexeme
 from ..symbols cimport conj
@@ -39,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+
 from cython.operator cimport dereference as deref
 
 
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 358b2bd806d..adfc2bb6658 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,11 +1,9 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
-from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .callbacks import create_copy_from_base_model  # noqa: F401
 from .corpus import Corpus, JsonlCorpus, PlainTextCorpus  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .iob_utils import (  # noqa: F401
@@ -19,28 +17,3 @@
     tags_to_entities,
 )
 from .loggers import console_logger  # noqa: F401
-
-__all__ = [
-    "Alignment",
-    "Corpus",
-    "Example",
-    "JsonlCorpus",
-    "PlainTextCorpus",
-    "biluo_tags_to_offsets",
-    "biluo_tags_to_spans",
-    "biluo_to_iob",
-    "create_copy_from_base_model",
-    "docs_to_json",
-    "dont_augment",
-    "iob_to_biluo",
-    "minibatch_by_padded_size",
-    "minibatch_by_words",
-    "offsets_to_biluo_tags",
-    "orth_variants_augmenter",
-    "read_json_file",
-    "remove_bilu_prefix",
-    "split_bilu_label",
-    "tags_to_entities",
-    "validate_get_examples",
-    "validate_examples",
-]
diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx
index c68110e304f..79fec73c411 100644
--- a/spacy/training/align.pyx
+++ b/spacy/training/align.pyx
@@ -1,4 +1,3 @@
-# cython: profile=False
 import re
 from itertools import chain
 from typing import List, Tuple
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 469bb263016..21f1b29f5a2 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,4 +1,17 @@
 import itertools
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    TypeVar,
+    Union,
+)
+
 from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 21c3d56a118..c2f3b8b51fa 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
 
 from ..errors import Errors
+from ..language import Language
 from ..util import load_model, logger, registry
 
-if TYPE_CHECKING:
-    from ..language import Language
-
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index 1ff7a64e09d..a78c39aea7b 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,9 +1,13 @@
 import srsly
-from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations_to_doc
-from ..example import _fix_legacy_dict_data, _parse_example_dict_data
-from ...util import load_model
+
 from ...lang.mul import MultiLanguage
+from ...util import load_model
+from ..example import (
+    _fix_legacy_dict_data,
+    _parse_example_dict_data,
+    annotations_to_doc,
+)
+from ..gold_io import json_iterate, json_to_annotations
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 4c17fc8f525..c6da5157748 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,5 +1,6 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index afbdf463110..a42e8f6425b 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -1,4 +1,4 @@
-# cython: profile=False
+import json
 import warnings
 
 import srsly
@@ -6,7 +6,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 
 
 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 61ad1c09cc0..781614c34d0 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,9 +1,3 @@
-from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
-from thinc.api import Config, ConfigValidationError
-from pathlib import Path
-import srsly
-import numpy
-import tarfile
 import gzip
 import tarfile
 import warnings
@@ -15,14 +9,27 @@
 import numpy
 import srsly
 import tqdm
-from thinc.api import Config, ConfigValidationError, fix_random_seed, set_gpu_allocator
+from thinc.api import Config, ConfigValidationError
 
 from ..errors import Errors, Warnings
+from ..lookups import Lookups
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
-from ..util import registry, load_model_from_config, resolve_dot_names, logger
-from ..util import load_model, ensure_path, get_sourced_components
-from ..util import OOV_RANK, DEFAULT_OOV_PROB
-from ..util import set_gpu_allocator_from_config, set_seed_from_config
+from ..util import (
+    DEFAULT_OOV_PROB,
+    OOV_RANK,
+    ensure_path,
+    get_sourced_components,
+    load_model,
+    load_model_from_config,
+    logger,
+    registry,
+    resolve_dot_names,
+    set_gpu_allocator_from_config,
+    set_seed_from_config,
+)
+from ..vectors import Mode as VectorsMode
+from ..vectors import Vectors
+from .pretrain import get_tok2vec_ref
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index ad162678fec..63715ec2c42 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,20 +3,34 @@
 import sys
 from pathlib import Path
 from timeit import default_timer as timer
-from thinc.api import Optimizer, Config, constant
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from thinc.api import Config, Optimizer, constant
 from wasabi import Printer
-import random
-import sys
-import shutil
 
-
-from .example import Example
-from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
+from .. import ty
 from ..errors import Errors
+from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
 from ..tokens.doc import Doc
-from .. import ty
-from ..util import resolve_dot_names, registry, logger
-from ..util import set_gpu_allocator_from_config, set_seed_from_config
+from ..util import (
+    logger,
+    registry,
+    resolve_dot_names,
+    set_gpu_allocator_from_config,
+    set_seed_from_config,
+)
+from .example import Example
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
diff --git a/spacy/ty.py b/spacy/ty.py
index ac09cb336ac..e4f34a5f651 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,5 +1,17 @@
-from typing import TYPE_CHECKING, Protocol, runtime_checkable
-from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    runtime_checkable,
+)
+
+from thinc.api import Model, Optimizer
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index 3bb92e7334c..ae9837e3afe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,13 +2,7 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model, constant as constant_schedule
-from thinc.api import fix_random_seed, set_gpu_allocator
-import functools
+import inspect
 import itertools
 import logging
 import os
@@ -55,15 +49,9 @@
 from packaging.requirements import Requirement
 from packaging.specifiers import InvalidSpecifier, SpecifierSet
 from packaging.version import InvalidVersion, Version
-from thinc.api import (
-    Adam,
-    Config,
-    ConfigValidationError,
-    Model,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-)
+from thinc.api import Adam, Config, ConfigValidationError, Model, NumpyOps, Optimizer
+from thinc.api import constant as constant_schedule
+from thinc.api import fix_random_seed, get_current_ops, set_gpu_allocator
 
 try:
     import cupy.random
@@ -71,12 +59,9 @@
     cupy = None
 
 
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
-from .errors import Errors, Warnings
 from . import about
-from .compat import CudaStream, cupy, importlib_metadata, is_windows
-from .errors import OLD_MODEL_SHORTCUTS, Errors, Warnings
+from .compat import CudaStream, cupy, is_windows
+from .errors import Errors, Warnings
 from .symbols import ORTH
 
 if TYPE_CHECKING:
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index e16efd2738d..876c56bed1d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,15 +1,13 @@
-# cython: infer_types=True, binding=True
-from typing import Callable
-
+cimport numpy as np
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64
 
+import functools
 import warnings
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Union, cast
+from typing import cast
 
 import numpy
 import srsly
@@ -21,13 +19,9 @@ from .attrs cimport ORTH, attr_id_t
 from .strings cimport StringStore
 
 from . import util
-from .attrs import IDS
 from .errors import Errors, Warnings
 from .strings import get_string_id
 
-if TYPE_CHECKING:
-    from .vocab import Vocab  # noqa: F401  # no-cython-lint
-
 
 def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index f317afd8924..de543c25d88 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,5 +1,7 @@
 import functools
 
+import functools
+
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
@@ -16,6 +18,7 @@ from .errors import Errors
 from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
 from .lang.norm_exceptions import BASE_NORMS
 from .lookups import Lookups
+from .util import registry
 from .vectors import Mode as VectorsMode
 from .vectors import Vectors
 

From fee9f70507180d7ef9927d3cf47554493c0b8482 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:21 +0200
Subject: [PATCH 296/504] Fix span <-> underscore import cycle

---
 spacy/tokens/underscore.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 63706851286..c3e3641d454 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
-from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
+    from .span import Span
     from .token import Token
 
 
@@ -40,7 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
-        if type(obj) == Span:
+        # We used to check if obj is a span, however, this introduces an
+        # import cycle between the span and underscore modeles. So we
+        # do a structural type check instead.
+        if hasattr(obj, "id") and hasattr(obj, "label") and hasattr(obj, "kb_id"):
             object.__setattr__(self, "_label", label)
             object.__setattr__(self, "_kb_id", kb_id)
             object.__setattr__(self, "_span_id", span_id)

From 02825c8b48567b7ea5423bd4666d360df2384c09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:45 +0200
Subject: [PATCH 297/504] Fix training.callbacks <-> language import cycle

---
 spacy/training/callbacks.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index c2f3b8b51fa..21c3d56a118 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,9 +1,11 @@
-from typing import Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional
 
 from ..errors import Errors
-from ..language import Language
 from ..util import load_model, logger, registry
 
+if TYPE_CHECKING:
+    from ..language import Language
+
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(

From 85e08ab5a323e61872e6e9cb90580a77c97e17e0 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 6 Jul 2023 15:20:13 +0200
Subject: [PATCH 298/504] Disallow False for first/last arguments of add_pipe
 (#12793)

* Literal True for first/last options

* add test case

* update docs

* remove old redundant test case

* black formatting

* use Optional typing in docstrings

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/errors.py                           |  1 +
 spacy/language.py                         | 20 ++++++++++++--------
 spacy/tests/pipeline/test_pipe_methods.py | 18 ++++++++++++++++--
 website/docs/api/language.mdx             |  7 ++++---
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4909371d549..2ddaef19bca 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -989,6 +989,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index ea641224684..5b2652db53b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -763,8 +763,8 @@ def add_pipe(
         *,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
         source: Optional["Language"] = None,
         config: Dict[str, Any] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
@@ -783,8 +783,8 @@ def add_pipe(
             component directly before.
         after (Union[str, int]): Name or index of the component to insert new
             component directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         source (Language): Optional loaded nlp object to copy the pipeline
             component from.
         config (Dict[str, Any]): Config parameters to use for this component.
@@ -830,18 +830,22 @@ def _get_pipe_index(
         self,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
     ) -> int:
         """Determine where to insert a pipeline component based on the before/
         after/first/last values.
 
         before (str): Name or index of the component to insert directly before.
         after (str): Name or index of component to insert directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         RETURNS (int): The index of the new pipeline component.
         """
+        if first is not None and first is not True:
+            raise ValueError(Errors.E4009.format(attr="first", value=first))
+        if last is not None and last is not True:
+            raise ValueError(Errors.E4009.format(attr="last", value=last))
         all_args = {"before": before, "after": after, "first": first, "last": last}
         if sum(arg is not None for arg in [before, after, first, last]) >= 2:
             raise ValueError(
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 39611a74278..063e5bf67fd 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -189,6 +189,22 @@ def test_add_pipe_last(nlp, name1, name2):
     assert nlp.pipeline[-1][0] == name1
 
 
+@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")])
+def test_add_pipe_false(nlp, name1, name2):
+    Language.component("new_pipe2", func=lambda doc: doc)
+    nlp.add_pipe("new_pipe2", name=name2)
+    with pytest.raises(
+        ValueError,
+        match="The 'last' parameter should be 'None' or 'True', but found 'False'",
+    ):
+        nlp.add_pipe("new_pipe", name=name1, last=False)
+    with pytest.raises(
+        ValueError,
+        match="The 'first' parameter should be 'None' or 'True', but found 'False'",
+    ):
+        nlp.add_pipe("new_pipe", name=name1, first=False)
+
+
 def test_cant_add_pipe_first_and_last(nlp):
     with pytest.raises(ValueError):
         nlp.add_pipe("new_pipe", first=True, last=True)
@@ -411,8 +427,6 @@ def test_add_pipe_before_after():
         nlp.add_pipe("entity_ruler", before="ner", after=2)
     with pytest.raises(ValueError):
         nlp.add_pipe("entity_ruler", before=True)
-    with pytest.raises(ValueError):
-        nlp.add_pipe("entity_ruler", first=False)
 
 
 def test_disable_enable_pipes():
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 82cb1c14cef..d65ea376431 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -436,7 +436,8 @@ component factory registered using
 [`@Language.component`](/api/language#component) or
 [`@Language.factory`](/api/language#factory). Components should be callables
 that take a `Doc` object, modify it and return it. Only one of `before`,
-`after`, `first` or `last` can be set. Default behavior is `last=True`.
+`after`, `first` or `last` can be set. The arguments `first` and `last` can
+either be `None` or `True`. Default behavior is `last=True`.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -471,8 +472,8 @@ component, adds it to the pipeline and returns it.
 | _keyword-only_                        |                                                                                                                                                                                                                                                                                          |
 | `before`                              | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~                                                                                                                                                                                               |
 | `after`                               | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~                                                                                                                                                                                                |
-| `first`                               | Insert component first / not first in the pipeline. ~~Optional[bool]~~                                                                                                                                                                                                                   |
-| `last`                                | Insert component last / not last in the pipeline. ~~Optional[bool]~~                                                                                                                                                                                                                     |
+| `first`                               | Insert component first in the pipeline if set to `True`. ~~Optional[Literal[True]]~~                                                                                                                                                                                                     |
+| `last`                                | Insert component last in the pipeline if set to `True`. ~~Optional[Literal[True]]~~                                                                                                                                                                                                      |
 | `config` <Tag variant="new">3</Tag>   | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Dict[str, Any]~~                                                                                                                                    |
 | `source` <Tag variant="new">3</Tag>   | Optional source pipeline to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source pipeline match the target pipeline. ~~Optional[Language]~~ |
 | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~                                                                                                                                                           |

From aadc2920f6010c56521487b4697a534b6af4f47c Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 16:38:29 +0200
Subject: [PATCH 299/504] merge fixes

---
 .../_parser_internals/_beam_utils.pyx         |  4 +-
 spacy/pipeline/morphologizer.pyx              |  1 -
 spacy/pipeline/transition_parser.pyx          | 27 ++++------
 spacy/tests/pipeline/test_tok2vec.py          | 54 +++++++++++++++++++
 .../tests/serialize/test_serialize_config.py  |  1 +
 spacy/tokens/span.pyx                         |  3 +-
 spacy/tokens/token.pyx                        |  2 +-
 spacy/vectors.pyx                             |  2 +-
 8 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 7098b822ef0..7c546752d80 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -4,9 +4,7 @@ cimport numpy as np
 
 import numpy
 
-from cpython.ref cimport Py_XDECREF, PyObject
-
-from ...typedefs cimport class_t, hash_t
+from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 7259fc02699..765fd83f111 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -19,7 +19,6 @@ from ..scorer import Scorer
 from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .pipe import deserialize_config
 from .tagger import ActivationsT, Tagger
 
 # See #9050
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index d521aeced7f..8e4bee2b3dd 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -7,15 +7,9 @@ from typing import Dict, Iterable, List, Optional, Tuple
 cimport numpy as np
 from cymem.cymem cimport Pool
 
-from itertools import islice
-
-from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
 import contextlib
 import random
-import warnings
+from itertools import islice
 
 import numpy
 import numpy.random
@@ -24,29 +18,21 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
-    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
-    softmax_activation,
-    use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
 
 from ..tokens.doc cimport Doc
-from ._parser_internals cimport _beam_utils
-from ._parser_internals.search cimport Beam
-from ._parser_internals.stateclass cimport StateC, StateClass
-from .trainable_pipe cimport TrainablePipe
-
-from ._parser_internals import _beam_utils
-
 from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
+from ._parser_internals cimport _beam_utils
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 from .. import util
 from ..errors import Errors, Warnings
@@ -62,6 +48,11 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
+
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index e557e294112..f6cefbc1f84 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -642,3 +642,57 @@ def tok2vec_distill_wrapper(
 
     student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
     student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
+
+
+def test_tok2vec_listener_source_link_name():
+    """The component's internal name and the tok2vec listener map correspond
+    to the most recently modified pipeline.
+    """
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+
+    # there is no way to have the component have the right name for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+
+    # there is no way to have the tok2vec have the right listener map for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.add_pipe("ner", name="ner3", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
+    nlp2.remove_pipe("ner3")
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.remove_pipe("tagger2")
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+
+    # at this point the tok2vec component corresponds to nlp2
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
+    nlp1.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    # modifying nlp2 syncs it back to nlp2
+    nlp2.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+
+def test_tok2vec_listener_source_replace_listeners():
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
+    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("tagger", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("ner", name="ner2", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 646ce0f5d48..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -18,6 +18,7 @@
     build_Tok2Vec_model,
 )
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.training import Example
 from spacy.util import (
     load_config,
     load_config_from_str,
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 0a4b964999d..ed85bb40da9 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -15,8 +15,9 @@ from ..lexeme cimport Lexeme
 from ..parts_of_speech cimport univ_pos_t
 from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
+from ..typedefs cimport attr_t
 from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from .token cimport Token
 
 from ..errors import Errors, Warnings
 from ..util import normalize_slice
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 234bbc1a789..47b4898bb75 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -435,7 +435,7 @@ cdef class Token:
         if "vector" in self.doc.user_token_hooks:
             return self.doc.user_token_hooks["vector"](self)
         else:
-            return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))
+            return self.vocab.get_vector(self.c.lex.orth)
 
     @property
     def vector_norm(self):
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 876c56bed1d..111a9d01e08 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -130,7 +130,7 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
         """Create a new vector store.
 
         strings (StringStore): The string store.

From 5a72b06b97712281d1f10db64758ca186645a0a1 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 17:41:29 +0200
Subject: [PATCH 300/504] cython fixes and cleanup

---
 spacy/matcher/phrasematcher.pyx               |  2 -
 spacy/ml/tb_framework.pyx                     | 55 ++++++++++---------
 spacy/morphology.pyx                          |  6 +-
 spacy/parts_of_speech.pxd                     |  2 +-
 spacy/pipeline/_parser_internals/ner.pyx      |  1 -
 spacy/pipeline/_parser_internals/search.pxd   |  1 -
 spacy/pipeline/_parser_internals/search.pyx   | 12 ++--
 .../_parser_internals/transition_system.pxd   |  4 +-
 .../_parser_internals/transition_system.pyx   | 21 ++++---
 spacy/pipeline/morphologizer.pyx              |  3 +-
 spacy/pipeline/pipe.pyx                       |  5 +-
 spacy/pipeline/trainable_pipe.pyx             | 17 +++---
 spacy/pipeline/transition_parser.pyx          | 55 ++++++++++---------
 spacy/strings.pyx                             |  9 +--
 spacy/tests/parser/_search.pyx                | 49 +++++++++--------
 spacy/tokens/doc.pyx                          |  2 +-
 spacy/tokens/morphanalysis.pyx                |  1 -
 spacy/tokens/span.pyx                         |  3 +-
 18 files changed, 119 insertions(+), 129 deletions(-)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 107d7d926ee..d1a8eaf33c4 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -160,7 +160,6 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
@@ -196,7 +195,6 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index fd0af12ceab..ed04045a6a5 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (
 
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
 
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
     Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
     if X is not None:
-        docs, moves = X
+        docs, _ = X
         model.get_ref("tok2vec").initialize(X=docs)
     else:
         model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")
@@ -192,9 +192,10 @@ class TransitionModelInputs:
         self,
         docs: List[Doc],
         moves: TransitionSystem,
-        actions: Optional[List[Ints1d]]=None,
-        max_moves: int=0,
-        states: Optional[List[State]]=None):
+        actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0,
+        states: Optional[List[State]] = None,
+    ):
         """
         actions (Optional[List[Ints1d]]): actions to apply for each Doc.
         docs (List[Doc]): Docs to predict transition sequences for.
@@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
     else:
         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-            feats, backprop_feats, seen_mask, is_train, actions=actions,
-            max_moves=inputs.max_moves)
+                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
+                                 max_moves=inputs.max_moves)
 
 
 def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
     cdef vector[StateC*] c_states
     cdef StateClass state
     for state in states:
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
 
     return (states, scores), backprop
 
+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
     cdef vector[StateC *] unfinished
     cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
@@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                    sizes.states)
+                                   sizes.states)
             else:
                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
             for i in range(sizes.states):
@@ -302,8 +304,8 @@ def _forward_fallback(
     backprop_feats,
     seen_mask,
     is_train: bool,
-    actions: Optional[List[Ints1d]]=None,
-    max_moves: int=0):
+    actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")
@@ -371,7 +373,7 @@ def _forward_fallback(
             for clas in set(model.attrs["unseen_classes"]):
                 if (d_scores[:, clas] < 0).any():
                     model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
         # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
         output.inc_grad("b", d_scores.sum(axis=0))
@@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
@@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
     else:
         # Compute hidden-to-output
         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                      1.0, <const float *>A.hiddens, n.hiddens,
-                      <const float *>W.hidden_weights, n.hiddens,
-                      0.0, scores, n.classes)
+                     1.0, <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens,
+                     0.0, scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                 scores[i*n.classes+j] = min_
 
 
-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
     cdef int F = n.feats
     cdef int T = n.tokens
     padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                 feature = &cached[idx]
             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
         token_ids += F
-
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index d75c1071941..e7f93b78b47 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -80,15 +80,13 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
-
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-        ])
+            ])
         return norm_feats_string or self.EMPTY_MORPH
 
-
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 01f116ea688..22a571be7b0 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
     ADV = symbols.ADV
     AUX = symbols.AUX
     CONJ = symbols.CONJ
-    CCONJ  = symbols.CCONJ  # U20
+    CCONJ = symbols.CCONJ  # U20
     DET = symbols.DET
     INTJ = symbols.INTJ
     NOUN = symbols.NOUN
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 5c31ff5c21d..3a352f51ff5 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -2,7 +2,6 @@ import os
 import random
 
 from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index 4626496335a..ad68dc5c718 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -57,7 +57,6 @@ cdef class Beam:
     cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
                      void* extra_args) except -1
     cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
- 
 
     cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
         self.scores[i][j] = score
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 251eaa805cb..578299b56ae 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,11 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.math cimport exp, log
-from libc.string cimport memcpy, memset
-
-import math
-
 from cymem.cymem cimport Pool
+from libc.math cimport exp
+from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap
 
 
@@ -70,7 +67,7 @@ cdef class Beam:
             self.costs[i][j] = costs[j]
 
     cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i, j
+        cdef int i
         for i in range(self.width):
             memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
             memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@@ -176,7 +173,6 @@ cdef class Beam:
         beam-width, and n is the number of classes.
         """
         cdef Entry entry
-        cdef weight_t score
         cdef _State* s
         cdef int i, j, move_id
         assert self.size >= 1
@@ -269,7 +265,7 @@ cdef class MaxViolation:
                 # This can happen from non-monotonic actions
                 # If we find a better gold analysis this way, be sure to keep it.
                 elif pred._states[i].loss <= 0 \
-                and tuple(pred.histories[i]) not in seen_golds:
+                        and tuple(pred.histories[i]) not in seen_golds:
                     g_scores.append(pred._states[i].score)
                     g_hist.append(list(pred.histories[i]))
             for i in range(gold.size):
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 66cc7747b69..08baed932ba 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -60,7 +60,7 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil
+                          int batch_size) nogil
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-        int nr_class, int batch_size) nogil
+                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index a433ce7dc75..50b155bf9bb 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -294,19 +294,19 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil:
-        cdef int i
-        cdef Transition action
-        cdef StateC* state
-        for i in range(batch_size):
-            state = states[i]
-            action = moves.c[actions[i]]
-            action.do(state, action.label)
-            state.history.push_back(action.clas)
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
 
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-    int nr_class, int batch_size) nogil:
+                             int nr_class, int batch_size) nogil:
     is_valid = <int*>calloc(moves.n_moves, sizeof(int))
     cdef int i, guess
     cdef Transition action
@@ -322,4 +322,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
             action.do(states[i], action.label)
             states[i].history.push_back(guess)
     free(is_valid)
-
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 765fd83f111..669a5424412 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,8 +1,7 @@
 # cython: infer_types=True, profile=True, binding=True
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Dict, Iterable, Optional, Union
 
-import srsly
 from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 7bc6735a802..8409e64c3cb 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,12 +1,11 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
-from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
 
 import srsly
 
 from ..tokens.doc cimport Doc
 
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example
 from ..util import raise_error
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index e7cf566a113..065a6c20d62 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,5 +1,4 @@
 # cython: infer_types=True, profile=True, binding=True
-import warnings
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
@@ -8,7 +7,7 @@ from thinc.api import Model, Optimizer, set_dropout_rate
 from ..tokens.doc cimport Doc
 
 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
 from ..training import Example, validate_distillation_examples, validate_examples
 from ..vocab import Vocab
@@ -56,14 +55,14 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-
     def distill(self,
-               teacher_pipe: Optional["TrainablePipe"],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+                teacher_pipe: Optional["TrainablePipe"],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ) -> Dict[str, float]:
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is typically trained on the probability
         distribution of the teacher, but details may differ per pipe.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 8e4bee2b3dd..9fa0d4987b8 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -228,12 +228,13 @@ class Parser(TrainablePipe):
         raise NotImplementedError
 
     def distill(self,
-               teacher_pipe: Optional[TrainablePipe],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None):
+                teacher_pipe: Optional[TrainablePipe],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ):
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is trained on the transition probabilities
         of the teacher.
@@ -283,11 +284,13 @@ class Parser(TrainablePipe):
         # teacher's distributions.
 
         student_inputs = TransitionModelInputs(docs=student_docs,
-            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
+                                               states=[state.copy() for state in states],
+                                               moves=self.moves,
+                                               max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
         actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            states=states, moves=teacher_pipe.moves, actions=actions)
+                                               states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -300,10 +303,9 @@ class Parser(TrainablePipe):
 
         return losses
 
-
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-        normalize: bool=False,
+            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+            normalize: bool = False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -326,9 +328,9 @@ class Parser(TrainablePipe):
         # ourselves.
 
         teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
         student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
 
         assert teacher_scores.shape == student_scores.shape
 
@@ -442,13 +444,15 @@ class Parser(TrainablePipe):
         else:
             init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
-            max_moves=max_moves, states=[state.copy() for state in init_states])
+        inputs = TransitionModelInputs(docs=docs,
+                                       moves=self.moves,
+                                       max_moves=max_moves,
+                                       states=[state.copy() for state in init_states])
         (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
         if sum(s.shape[0] for s in scores) == 0:
             return losses
         d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-            examples, max_moves)
+                                 examples, max_moves)
         backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
@@ -489,9 +493,7 @@ class Parser(TrainablePipe):
         cdef TransitionSystem moves = self.moves
         cdef StateClass state
         cdef int clas
-        cdef int nF = self.model.get_dim("nF")
         cdef int nO = moves.n_moves
-        cdef int nS = sum([len(history) for history in histories])
         cdef Pool mem = Pool()
         cdef np.ndarray costs_i
         is_valid = <int*>mem.alloc(nO, sizeof(int))
@@ -558,8 +560,8 @@ class Parser(TrainablePipe):
 
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0.,
+                    sgd=None, losses=None, beam_density=0.0):
         raise NotImplementedError
 
     def set_output(self, nO):
@@ -684,9 +686,10 @@ class Parser(TrainablePipe):
             return states
 
         # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
-            states=[state.copy() for state in to_cut])
-        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+        teacher_inputs = TransitionModelInputs(docs=docs,
+                                               moves=moves,
+                                               states=[state.copy() for state in to_cut])
+        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
 
         # Step through the teacher's actions and store every state after
         # each multiple of max_length.
@@ -784,6 +787,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
 
     return actions
 
+
 def _states_diff_to_actions(
     before_states: List[StateClass],
     after_states: List[StateClass]
@@ -804,8 +808,9 @@ def _states_diff_to_actions(
         c_state_before = before_state.c
         c_state_after = after_state.c
 
-        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
-            c_state_after.history.begin())
+        assert equal(c_state_before.history.begin(),
+                     c_state_before.history.end(),
+                     c_state_after.history.begin())
 
     actions = []
     while True:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 43826f07c44..28e06a2ecea 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,6 @@
 # cython: infer_types=True
-from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
 
-cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from libcpp.set cimport set
@@ -244,7 +243,6 @@ cdef class StringStore:
         cdef int n_length_bytes
         cdef int i
         cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
-        cdef uint32_t ulength = length
         if length < sizeof(string.s):
             string.s[0] = <unsigned char>length
             memcpy(&string.s[1], chars, length)
@@ -302,7 +300,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
 
     try:
         return hash_string(string_or_hash)
-    except:
+    except:   # no-cython-lint
         if _try_coerce_to_hash(string_or_hash, &str_hash):
             # Coerce the integral key to the expected primitive hash type.
             # This ensures that custom/overloaded "primitive" data types
@@ -319,6 +317,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
         return True
-    except:
+    except:  # no-cython-lint
         return False
-
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 0983159b75d..cd9e6b2f5ee 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -2,7 +2,7 @@
 from cymem.cymem cimport Pool
 
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t, weight_t
+from spacy.typedefs cimport class_t
 
 import pytest
 
@@ -42,32 +42,35 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
     state = <TestState*>state
     mem.free(state)
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width",
-    [
-        (2, 3),
-        (3, 6),
-        (4, 20),
-    ]
-)
+                         [
+                             (2, 3),
+                             (3, 6),
+                             (4, 20),
+                         ]
+                         )
 def test_init(nr_class, beam_width):
     b = Beam(nr_class, beam_width)
     assert b.size == 1
     assert b.width == beam_width
     assert b.nr_class == nr_class
 
+
 @cytest
 def test_init_violn():
     MaxViolation()
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (2, 3, 3),
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (2, 3, 3),
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_initialize(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
@@ -79,11 +82,11 @@ def test_initialize(nr_class, beam_width, length):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length,extra",
-    [
-        (2, 3, 4, None),
-        (3, 6, 15, u"test beam 1"),
-    ]
-)
+                         [
+                             (2, 3, 4, None),
+                             (3, 6, 15, u"test beam 1"),
+                         ]
+                         )
 def test_initialize_extra(nr_class, beam_width, length, extra):
     b = Beam(nr_class, beam_width)
     if extra is None:
@@ -97,11 +100,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_transition(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ff1a0d310d1..4b8a15a65fd 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1758,7 +1758,7 @@ cdef class Doc:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index 014c01a2f74..f3841baa24a 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.string cimport memset
 
 from ..errors import Errors
 from ..morphology import Morphology
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index ed85bb40da9..332123ad774 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -242,8 +242,8 @@ cdef class Span:
 
     @property
     def _(self):
-        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
+        cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
                           start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
@@ -953,7 +953,6 @@ cdef class Span:
             self.id_ = ent_id_
 
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From d45b7ad9a8931d8c595faabfae435b42569d7397 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 20 Jul 2023 09:59:19 +0200
Subject: [PATCH 301/504] Update spacy/ml/tb_framework.pyx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/ml/tb_framework.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index ed04045a6a5..a48c6b901c7 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -305,7 +305,8 @@ def _forward_fallback(
     seen_mask,
     is_train: bool,
     actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0):
+    max_moves: int = 0,
+):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")

From cd336fb3b582223ed5cc68a209d450977a1fd0a0 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 20 Jul 2023 14:08:29 +0200
Subject: [PATCH 302/504] remove unnecessary line

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/ml/tb_framework.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index a48c6b901c7..6c5c29d8549 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -145,7 +145,6 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")

From defcada0ef27bbdd0543e69b6b0995f328c16437 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 15:54:35 +0200
Subject: [PATCH 303/504] Recommend lookups tables from URLs or other loaders
 (#12283)

* Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/about.py                             |  4 ++
 spacy/errors.py                            | 25 ++++++++---
 spacy/language.py                          |  7 ----
 spacy/lookups.py                           | 26 +++++++++++-
 spacy/pipeline/lemmatizer.py               | 21 +++++++++-
 spacy/tests/pipeline/test_lemmatizer.py    | 16 ++++++-
 website/docs/api/lemmatizer.mdx            |  4 +-
 website/docs/api/top-level.mdx             | 49 ++++++++++++++++++++++
 website/docs/usage/index.mdx               |  7 ++--
 website/docs/usage/linguistic-features.mdx |  6 +--
 website/src/widgets/quickstart-install.js  |  4 --
 11 files changed, 141 insertions(+), 28 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index ec1dde7cae6..73f201af5fb 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -3,3 +3,7 @@
 __version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
diff --git a/spacy/errors.py b/spacy/errors.py
index 2ddaef19bca..adca5880283 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,8 @@
 import warnings
 from typing import Literal
 
+from . import about
+
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
@@ -103,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
             "table. This may degrade the performance of the model to some "
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
-            "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed and load the table in your config. The "
-            "languages with lexeme normalization tables are currently: "
-            "{langs}\n\nLoad the table in your config with:\n\n"
+            "If this is surprising, make sure you are loading the table in "
+            "your config. The languages with lexeme normalization tables are "
+            "currently: {langs}\n\nAn example of how to load a table in "
+            "your config :\n\n"
             "[initialize.lookups]\n"
-            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
             "tables = [\"lexeme_norm\"]\n")
     W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
             "attribute or operator.")
@@ -990,6 +993,18 @@ class Errors(metaclass=ErrorsWithCodes):
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
     E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
+    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
+             "[initialize] or in registered lookups (spacy-lookups-data). An "
+             "example for how to load lemmatizer tables in [initialize]:\n\n"
+             "[initialize.components]\n\n"
+             "[initialize.components.{pipe_name}]\n\n"
+             "[initialize.components.{pipe_name}.lookups]\n"
+             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
+             "tables = {tables}\n"
+             "# or required tables only: tables = {required_tables}\n")
+    E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/language.py b/spacy/language.py
index 5b2652db53b..72d27c598cc 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -128,13 +128,6 @@ def tokenizer_factory(nlp: "Language") -> Tokenizer:
     return tokenizer_factory
 
 
-@registry.misc("spacy.LookupsDataLoader.v1")
-def load_lookups_data(lang, tables):
-    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
-    lookups = load_lookups(lang=lang, tables=tables)
-    return lookups
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 1a2c44bfa1c..e2e92426f6a 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -2,16 +2,40 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import requests
 import srsly
 from preshed.bloom import BloomFilter
 
 from .errors import Errors
 from .strings import get_string_id
-from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
+from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
 
 UNSET = object()
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+    logger.debug(f"Loading lookups from {url}: {tables}")
+    lookups = Lookups()
+    for table in tables:
+        table_url = url + lang + "_" + table + ".json"
+        r = requests.get(table_url)
+        if r.status_code != 200:
+            raise ValueError(
+                Errors.E4011.format(status_code=r.status_code, url=table_url)
+            )
+        table_data = r.json()
+        lookups.add_table(table, table_data)
+    return lookups
+
+
 def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 09e501595a8..ed9547c745b 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import srsly
 from thinc.api import Model
 
 from .. import util
@@ -155,8 +156,24 @@ def initialize(
         """
         required_tables, optional_tables = self.get_lookups_config(self.mode)
         if lookups is None:
-            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
-            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            logger.debug(
+                "Lemmatizer: no lemmatizer lookups tables provided, "
+                "trying to load tables from registered lookups (usually "
+                "spacy-lookups-data)"
+            )
+            lookups = load_lookups(
+                lang=self.vocab.lang, tables=required_tables, strict=False
+            )
+            missing_tables = set(required_tables) - set(lookups.tables)
+            if len(missing_tables) > 0:
+                raise ValueError(
+                    Errors.E4010.format(
+                        missing_tables=list(missing_tables),
+                        pipe_name=self.name,
+                        required_tables=srsly.json_dumps(required_tables),
+                        tables=srsly.json_dumps(required_tables + optional_tables),
+                    )
+                )
             optional_lookups = load_lookups(
                 lang=self.vocab.lang, tables=optional_tables, strict=False
             )
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index ccc2e0b154a..5385fb5d7dd 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -2,9 +2,11 @@
 
 import pytest
 
+import spacy
 from spacy import registry, util
+from spacy.about import __lookups_url__
 from spacy.lang.en import English
-from spacy.lookups import Lookups
+from spacy.lookups import Lookups, load_lookups_data_from_url
 
 from ..util import make_tempdir
 
@@ -113,3 +115,15 @@ def cope_lookups():
 
     # Make sure that lemmatizer cache can be pickled
     pickle.dumps(lemmatizer2)
+
+
+@pytest.mark.parametrize("lang", ("ca", "en"))
+def test_lemmatizer_load_lookups_from_url(lang):
+    nlp = spacy.blank(lang)
+    lemmatizer = nlp.add_pipe("lemmatizer")
+    req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+    lookups = load_lookups_data_from_url(
+        nlp.lang, req_tables + opt_tables, __lookups_url__
+    )
+    lemmatizer.initialize(lookups=lookups)
+    assert set(lemmatizer.lookups.tables) == set(req_tables + opt_tables)
diff --git a/website/docs/api/lemmatizer.mdx b/website/docs/api/lemmatizer.mdx
index f6657dbf48c..5bd0112e237 100644
--- a/website/docs/api/lemmatizer.mdx
+++ b/website/docs/api/lemmatizer.mdx
@@ -14,7 +14,7 @@ implement their own lemmatizer components via
 [language-specific factories](/usage/processing-pipelines#factories-language).
 The default data used is provided by the
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-extension package.
+repository.
 
 For a trainable lemmatizer, see [`EditTreeLemmatizer`](/api/edittreelemmatizer).
 
@@ -174,6 +174,8 @@ training. At runtime, all data is loaded from disk.
 >
 > ```python
 > lemmatizer = nlp.add_pipe("lemmatizer")
+> req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode)
+> lookups = load_lookups(nlp.lang, req_tables + opt_tables)
 > lemmatizer.initialize(lookups=lookups)
 > ```
 >
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 8555d64ba63..a2d4bbdd387 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -9,6 +9,7 @@ menu:
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
   - ['Callbacks', 'callbacks']
+  - ['Miscellaneous', 'misc']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -1058,6 +1059,54 @@ methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
 | `additional_pipe_functions` | Additional pipeline methods to wrap. Keys are pipeline names and values are lists of method identifiers. Defaults to `None`. ~~Optional[Dict[str, List[str]]]~~ |
 | **CREATES**                 | A function that takes the current `nlp` and wraps pipe models and methods in NVTX ranges. ~~Callable[[Language], Language]~~                                    |
 
+## Miscellaneous {id="misc"}
+
+### spacy.LookupsDataLoader.v1 {id="lookups_data_reader",tag="registered function",version="3"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.lookups]
+> @misc = "spacy.LookupsDataLoader.v1"
+> lang = ${nlp.lang}
+> tables = ["lexeme_prob"]
+> ```
+
+Load the specified tables from the [`lookups` registry](#registry), which are
+provided by a package such as
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
+
+| Name        | Description                                                                                     |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                           |
+| `tables`    | The tables to load. ~~List[str]~~                                                               |
+| **CREATES** | A function that loads the specified tables from the lookups registry. ~~Callable[[], Lookups]~~ |
+
+### spacy.LookupsDataLoaderFromURL.v1 {id="lookups_data_reader_from_url",tag="registered function",version="4"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.lemmatizer.lookups]
+> @misc = "spacy.LookupsDataLoaderFromURL.v1"
+> lang = ${nlp.lang}
+> url = "https://raw.githubusercontent.com/explosion/spacy-lookups-data/v1.0.3/spacy_lookups_data/data/"
+> tables = ["lemma_rules","lemma_exc","lemma_index"]
+> ```
+
+Load the specified tables from the provided URL. The individual tables are
+expected to have filenames in the format `{lang}_{table}.json` under the
+specified URL directory as in the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data/spacy_lookups_data/data/)
+repository.
+
+| Name        | Description                                                                                 |
+| ----------- | ------------------------------------------------------------------------------------------- |
+| `lang`      | The language. ~~str~~                                                                       |
+| `url`       | The URL for the directory where the tables can be downloaded. ~~str~~                       |
+| `tables`    | The tables to load. ~~List[str]~~                                                           |
+| **CREATES** | A function that loads the specified tables from the provided URL. ~~Callable[[], Lookups]~~ |
+
 ## Training data and alignment {id="gold",source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {id="offsets_to_biluo_tags",tag="function"}
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index b8b4917f2b2..6faad1d6a0f 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -59,19 +59,18 @@ $ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
-keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+keywords in brackets, e.g. `spacy[ja]` or `spacy[apple,transformers]` (with
 multiple comma-separated extras). See the `[options.extras_require]` section in
 spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 
 > #### Example
 >
 > ```bash
-> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
+> $ pip install %%SPACY_PKG_NAME[apple,transformers]%%SPACY_PKG_FLAGS
 > ```
 
 | Name             | Description                                                                                                                                                                                                                                                    |
 | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
 | `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
 | `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
 | `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
@@ -174,7 +173,7 @@ $ pip install --no-build-isolation --editable . # compile and install spaCy
 To install with extras:
 
 ```bash
-$ pip install --no-build-isolation --editable .[lookups,cuda102]
+$ pip install --no-build-isolation --editable .[ja,cuda102]
 ```
 
 How to install compilers and related build tools:
diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx
index 21cedd1ef2c..26d1ad37962 100644
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@@ -148,11 +148,11 @@ component.
 
 </Infobox>
 
-The data for spaCy's lemmatizers is distributed in the package
+The data for spaCy's lemmatizers is distributed in the repository
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The
 provided trained pipelines already include all the required tables, but if you
-are creating new pipelines, you'll probably want to install `spacy-lookups-data`
-to provide the data when the lemmatizer is initialized.
+are creating new pipelines, you can load data from the repository in the
+lemmatizer initialization.
 
 ### Lookup lemmatizer {id="lemmatizer-lookup"}
 
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 43e3a0eeb6c..f4e0a01e8ca 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -50,7 +50,6 @@ const QuickstartInstall = ({ id, title }) => {
     const pipExtras = [
         hardware === 'gpu' && (platform !== 'arm' || os === 'linux') && cuda,
         train && 'transformers',
-        train && 'lookups',
         apple && 'apple',
         ...modelExtras,
     ]
@@ -214,9 +213,6 @@ const QuickstartInstall = ({ id, title }) => {
             <QS config="train" package="conda" comment prompt={false}>
                 # packages only available via pip
             </QS>
-            <QS config="train" package="conda">
-                pip install spacy-lookups-data
-            </QS>
 
             {languages.map(({ code, models: modelOptions }) => {
                 const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]

From ef198a7987512f4529f3009077eb21572018784e Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 1 Aug 2023 22:24:02 +0900
Subject: [PATCH 304/504] Accept multiple code files in all CLI commands
 (#12101)

* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/cli/_util.py          |   7 ++
 spacy/cli/assemble.py       |   6 +-
 spacy/cli/debug_config.py   |   6 +-
 spacy/cli/debug_data.py     |   6 +-
 spacy/cli/evaluate.py       |   6 +-
 spacy/cli/package.py        |   2 +-
 spacy/cli/pretrain.py       |   6 +-
 spacy/cli/train.py          |   6 +-
 spacy/tests/test_cli_app.py | 206 ++++++++++++++++++++++++++++++++++++
 website/docs/api/cli.mdx    | 108 +++++++++----------
 10 files changed, 286 insertions(+), 73 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index b005accf91f..eed61119070 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -348,6 +348,13 @@ def show_validation_error(
         msg.fail("Config validation error", e, exits=1)
 
 
+def import_code_paths(code_paths: str) -> None:
+    """Helper to import comma-separated list of code paths."""
+    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
+    for code_path in code_paths:
+        import_code(code_path)
+
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index f74bbacb555..7ad0f52fe1d 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -11,7 +11,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -26,7 +26,7 @@ def assemble_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
@@ -46,7 +46,7 @@ def assemble_cli(
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 0e5382cd956..7818b4087e7 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -13,7 +13,7 @@
     Arg,
     Opt,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -27,7 +27,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on
@@ -44,7 +44,7 @@ def debug_config_cli(
     DOCS: https://spacy.io/api/cli#debug-config
     """
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
     )
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 4c44a8c0e2b..714969be145 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -40,7 +40,7 @@
     _format_number,
     app,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -72,7 +72,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@@ -92,7 +92,7 @@ def debug_data_cli(
             "--help for an overview of the other available debugging commands."
         )
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_data(
         config_path,
         config_overrides=overrides,
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 2276ca6b0d4..c3527028e9d 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -10,7 +10,7 @@
 from ..scorer import Scorer
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
 
 
 @benchmark_cli.command(
@@ -22,7 +22,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -43,7 +43,7 @@ def evaluate_cli(
 
     DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
-    import_code(code_path)
+    import_code_paths(code_path)
     evaluate(
         model,
         data_path,
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 9421199f111..06b503271af 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -23,7 +23,7 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
+    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 446c40510df..73337a7ca98 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -11,7 +11,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -27,7 +27,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -56,7 +56,7 @@ def pretrain_cli(
     DOCS: https://spacy.io/api/cli#pretrain
     """
     config_overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
     setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c72e13b2681..40934f546e2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -13,7 +13,7 @@
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -28,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
@@ -50,7 +50,7 @@ def train_cli(
     if verbose:
         util.logger.setLevel(logging.DEBUG)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
 
 
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 32ca639b37d..f9c1a9d6579 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,4 +1,5 @@
 import os
+import subprocess
 import sys
 from pathlib import Path
 
@@ -6,6 +7,7 @@
 import srsly
 from typer.testing import CliRunner
 
+import spacy
 from spacy.cli._util import app, get_git_version
 from spacy.tokens import Doc, DocBin
 
@@ -47,6 +49,210 @@ def test_convert_auto_conflict():
         assert len(out_files) == 0
 
 
+NOOP_CONFIG = """
+[paths]
+train = null
+dev = null
+vectors = null
+init_tok2vec = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "mul"
+pipeline = ["noop", "noop2"]
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+[components]
+
+[components.noop]
+factory = "noop"
+
+[components.noop2]
+factory = "noop2"
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[training]
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 100
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+dev_corpus = "corpora.dev"
+
+train_corpus = "corpora.train"
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.fixture
+def data_paths():
+    nlp = spacy.blank("mul")
+    doc = nlp("ok")
+    with make_tempdir() as tdir:
+        db = DocBin()
+        # debug data will *fail* if there aren't enough docs
+        for ii in range(100):
+            db.add(doc)
+        fpath = tdir / "data.spacy"
+        db.to_disk(fpath)
+
+        args = [
+            "--paths.train",
+            str(fpath),
+            "--paths.dev",
+            str(fpath),
+        ]
+        yield args
+
+
+@pytest.fixture
+def code_paths():
+    noop_base = """
+from spacy.language import Language
+
+@Language.component("{}")
+def noop(doc):
+    return doc
+"""
+
+    with make_tempdir() as temp_d:
+        # write code files to load
+        paths = []
+        for ff in ["noop", "noop2"]:
+            pyfile = temp_d / f"{ff}.py"
+            pyfile.write_text(noop_base.format(ff))
+            paths.append(pyfile)
+
+        args = ["--code", ",".join([str(pp) for pp in paths])]
+        yield args
+
+
+@pytest.fixture
+def noop_config():
+    with make_tempdir() as temp_d:
+        cfg = temp_d / "config.cfg"
+        cfg.write_text(NOOP_CONFIG)
+
+        yield cfg
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "cmd",
+    ["debug config", "debug data", "train", "assemble"],
+)
+def test_multi_code(cmd, code_paths, data_paths, noop_config):
+    # check that it fails without the code arg
+    cmd = cmd.split()
+    output = ["."] if cmd[0] == "assemble" else []
+    cmd = [sys.executable, "-m", "spacy"] + cmd
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+
+@pytest.mark.slow
+def test_multi_code_evaluate(code_paths, data_paths, noop_config):
+    # Evaluation requires a model, not a config, so this works differently from
+    # the other commands.
+
+    # Train a model to evaluate
+    cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split()
+    result = subprocess.run([*cmd, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+    # now do the evaluation
+
+    eval_data = data_paths[-1]
+    cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split()
+
+    # check that it fails without the code arg
+    result = subprocess.run(cmd)
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, *code_paths])
+    assert result.returncode == 0
+
+
 def test_benchmark_accuracy_alias():
     # Verify that the `evaluate` alias works correctly.
     result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 3f91e1ff71e..765bcb8c675 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -176,15 +176,15 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
-| Name                   | Description                                                                                                                                                                          |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                            |
-| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                   |
-| `--code`, `-c`         | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                    |
-| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                        |
-| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                   |
+| Name                   | Description                                                                                                                                                                                            |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                                              |
+| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                                     |
+| `--code`, `-c`         | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                      |
+| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                                          |
+| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                                     |
 
 ### init fill-curated-transformer {id="init-fill-curated-transformer",version="3.7",tag="command"}
 
@@ -266,7 +266,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                                               |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~                                                                                                                                                |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -491,7 +491,7 @@ File       /path/to/thinc/thinc/schedules.py (line 91)
 | Name                     | Description                                                                                                                                                                                                                    |
 | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~             |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                         |
 | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
 | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
@@ -676,7 +676,7 @@ will not be available.
 | Name                       | Description                                                                                                                                                                                                        |
 | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`              | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `--code`, `-c`             | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`             | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~                                                                                                                                                       |
 | `--verbose`, `-V`          | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                     |
 | `--no-format`, `-NF`       | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~                                                                                                                           |
@@ -1136,7 +1136,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -1206,6 +1206,7 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
+| `--code`, `-c`                                     | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
 | `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
@@ -1243,20 +1244,19 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component] [--spans-key]
 ```
 
-| Name                                                 | Description                                                                                                                                                                          |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
-| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
-| `--code`, `-c` <Tag variant="new">3</Tag>            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
-| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
-| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                           |
-| `--spans-key`, `-sk` <Tag variant="new">3.6.2</Tag>  | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~                                                                                                     |
-| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                            |
+| Name                                                 | Description                                                                                                                                                                                            |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                                              |
+| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                                    |
+| `--code`, `-c` <Tag variant="new">3</Tag>            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                                             |
+| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~                              |
+| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                                             |
+| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                                              |
 
 ### speed {id="benchmark-speed", version="3.5", tag="command"}
 
@@ -1302,19 +1302,19 @@ If you want to evaluate the pipeline on raw text only, make sure that the .spacy
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
 
-| Name                      | Description                                                                                                                                                                          |
-| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
-| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
-| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
-| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
-| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
-| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
-| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
-| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
+| Name                      | Description                                                                                                                                                                                            |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                    |
+| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                                   |
+| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                                             |
+| `--code`, `-c`            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                                              |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                                      |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                                    |
+| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                           |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                                               |
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
@@ -1341,19 +1341,19 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                    | Description                                                                                                                                                                          |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| Name                     | Description                                                                                                                                                                                            |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                                  |
+| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                                           |
+| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                                          |
+| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                                              |
+| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                                     |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                                       |
+| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
 
 ## assemble {#assemble tag="command"}
 
@@ -1377,7 +1377,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `config_path`     | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`      | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                   |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~                                                |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~        |
 | `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~                                                                                                                                                |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                    |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~                            |

From 0a41b56b25d46c95f982916d7af8f908b2cbabaf Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 9 Aug 2023 10:55:52 +0200
Subject: [PATCH 305/504] Switch zh tokenizer default pkuseg_model to
 spacy_ontonotes (#12896)

So that users can use `copy_from_base_model` for other segmenters
without having to override an irrelevant `pkuseg_model` setting, switch
the default `pkuseg_model` to `spacy_ontonotes`.
---
 spacy/lang/zh/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index f7bb092771c..6b980b52b61 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -31,7 +31,7 @@
 [initialize]
 
 [initialize.tokenizer]
-pkuseg_model = null
+pkuseg_model = "spacy_ontonotes"
 pkuseg_user_dict = "default"
 """
 

From fce12dbaf78ce9ed92a07cdd648e5bd39321ff85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 14:38:05 +0100
Subject: [PATCH 306/504] Revert "Reimplement distillation with oracle cut size
 (#12214)"

This reverts commit e27c60a70263f7ab17968964de37e938653e37a2.
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 105 ++++++---------------------
 spacy/tests/parser/test_model.py     |  61 ----------------
 spacy/tests/parser/test_ner.py       |   5 +-
 spacy/tests/parser/test_parse.py     |   5 +-
 5 files changed, 24 insertions(+), 156 deletions(-)
 delete mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 6c5c29d8549..e497643f0cd 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -267,11 +267,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1 and (actions is None or len(actions) > 0):
+    while sizes.states >= 1:
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
-        assert step_actions is None or step_actions.size == sizes.states, \
-            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9fa0d4987b8..99970b3fe93 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -49,10 +49,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 
 
 
-# TODO: Remove when we switch to Cython 3.
-cdef extern from "<algorithm>" namespace "std" nogil:
-    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
-
 NUMPY_OPS = NumpyOps()
 
 
@@ -271,8 +267,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -283,14 +279,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs,
-                                               states=[state.copy() for state in states],
-                                               moves=self.moves,
-                                               max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_diff_to_actions(states, student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-                                               states=states, moves=teacher_pipe.moves, actions=actions)
+            moves=self.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -538,7 +532,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_to_actions(student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -658,7 +652,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
+    def _init_batch(self, teacher_step_model, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -667,12 +661,10 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
+            StateClass start_state
             StateClass state
-            TransitionSystem moves = teacher_pipe.moves
-
-        # Start with the same heuristic as in supervised training: exclude
-        # docs that are within the maximum length.
-        all_states = moves.init_batch(docs)
+            Transition action
+        all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -681,30 +673,19 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
-
-        if not to_cut:
-            return states
-
-        # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs,
-                                               moves=moves,
-                                               states=[state.copy() for state in to_cut])
-        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
-
-        # Step through the teacher's actions and store every state after
-        # each multiple of max_length.
-        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            for step_actions in teacher_actions[:max_length]:
-                to_cut = moves.apply_actions(to_cut, step_actions)
-            teacher_actions = teacher_actions[max_length:]
-
-            if len(teacher_actions) < max_length:
-                break
-
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
         return states
 
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long doc will get multiple states. Let's say we
@@ -765,7 +746,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -786,47 +767,3 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
-
-
-def _states_diff_to_actions(
-    before_states: List[StateClass],
-    after_states: List[StateClass]
-) -> List[Ints1d]:
-    """
-    Return for two sets of states the actions to go from the first set of
-    states to the second set of states. The histories of the first set of
-    states must be a prefix of the second set of states.
-    """
-    cdef StateClass before_state, after_state
-    cdef StateC* c_state_before
-    cdef StateC* c_state_after
-
-    assert len(before_states) == len(after_states)
-
-    # Check invariant: before states histories must be prefixes of after states.
-    for before_state, after_state in zip(before_states, after_states):
-        c_state_before = before_state.c
-        c_state_after = after_state.c
-
-        assert equal(c_state_before.history.begin(),
-                     c_state_before.history.end(),
-                     c_state_after.history.begin())
-
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for before_state, after_state in zip(before_states, after_states):
-            c_state_before = before_state.c
-            c_state_after = after_state.c
-            if step < c_state_after.history.size() - c_state_before.history.size():
-                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
deleted file mode 100644
index 8c1cf7a9346..00000000000
--- a/spacy/tests/parser/test_model.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import pytest
-
-from spacy.lang.en import English
-from spacy.ml.tb_framework import TransitionModelInputs
-from spacy.training import Example
-
-TRAIN_DATA = [
-    (
-        "They trade mortgage-backed securities.",
-        {
-            "heads": [1, 1, 4, 4, 5, 1, 1],
-            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
-        },
-    ),
-    (
-        "I like London and Berlin.",
-        {
-            "heads": [1, 1, 1, 2, 2, 1],
-            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
-        },
-    ),
-]
-
-
-@pytest.fixture
-def nlp_parser():
-    nlp = English()
-    parser = nlp.add_pipe("parser")
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for dep in annotations["deps"]:
-            parser.add_label(dep)
-    nlp.initialize()
-
-    return nlp, parser
-
-
-def test_incorrect_number_of_actions(nlp_parser):
-    nlp, parser = nlp_parser
-    doc = nlp.make_doc("test")
-
-    # Too many actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
-            )
-        )
-
-    # Too few actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc, doc],
-                moves=parser.moves,
-                actions=[numpy.array([0], dtype="i")],
-            )
-        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index f0efc3a63fd..bb9b7653ce3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -623,9 +623,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,7 +641,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 636bb887789..d25eb165acb 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -462,9 +462,7 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -482,7 +480,6 @@ def test_distill(max_moves):
 
     student = English()
     student_parser = student.add_pipe("parser")
-    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From de327f7ee754da7af11f266ec2119bbdd9f6a219 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:23:08 +0100
Subject: [PATCH 307/504] Revert "Merge the parser refactor into `v4` (#10940)"

This reverts commit a183db3cefe0713e828868b43daf61c464cae05c.
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/compat.py                               |   5 +
 spacy/errors.py                               |   7 +-
 spacy/ml/_precomputable_affine.py             | 164 +++++
 spacy/ml/models/parser.py                     | 177 +++--
 spacy/ml/parser_model.pxd                     |  49 ++
 spacy/ml/parser_model.pyx                     | 500 ++++++++++++++
 spacy/ml/tb_framework.pxd                     |  28 -
 spacy/ml/tb_framework.py                      |  50 ++
 spacy/ml/tb_framework.pyx                     | 639 ------------------
 .../_parser_internals/_beam_utils.pyx         |   4 +-
 .../_parser_internals/_parser_utils.pxd       |   2 -
 .../_parser_internals/_parser_utils.pyx       |  22 -
 spacy/pipeline/_parser_internals/_state.pxd   |  60 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 -
 spacy/pipeline/_parser_internals/batch.pxd    |   2 -
 spacy/pipeline/_parser_internals/batch.pyx    |  52 --
 spacy/pipeline/_parser_internals/ner.pyx      |   6 +-
 .../pipeline/_parser_internals/stateclass.pyx |   7 -
 .../_parser_internals/transition_system.pxd   |   7 -
 .../_parser_internals/transition_system.pyx   |  73 --
 .../{dep_parser.py => dep_parser.pyx}         |  20 +-
 spacy/pipeline/{ner.py => ner.pyx}            |  35 +-
 spacy/pipeline/transition_parser.pxd          |  21 +
 spacy/pipeline/transition_parser.pyx          | 504 ++++++++------
 spacy/tests/parser/test_ner.py                |  10 +-
 spacy/tests/parser/test_parse.py              |  74 +-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  55 +-
 spacy/training/example.pyx                    |   2 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 36 files changed, 1384 insertions(+), 1312 deletions(-)
 create mode 100644 spacy/ml/_precomputable_affine.py
 create mode 100644 spacy/ml/parser_model.pxd
 create mode 100644 spacy/ml/parser_model.pyx
 delete mode 100644 spacy/ml/tb_framework.pxd
 create mode 100644 spacy/ml/tb_framework.py
 delete mode 100644 spacy/ml/tb_framework.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.py => dep_parser.pyx} (96%)
 rename spacy/pipeline/{ner.py => ner.pyx} (93%)
 create mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index a4a87d68aec..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,12 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.tb_framework",
+    "spacy.ml.parser_model",
     "spacy.morphology",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -43,7 +45,6 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
-    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -51,7 +52,6 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
-    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 36325711d4d..2817147f3e9 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,11 +90,12 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -110,11 +111,12 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -385,11 +387,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -402,11 +405,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/compat.py b/spacy/compat.py
index 1e63807a0e8..30459e2e495 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,6 +23,11 @@
 except ImportError:
     cupy = None
 
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index adca5880283..a5d0b3d11a9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,12 +217,6 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
-    # v4 warning strings
-    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
-    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
-            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
-            "to return `True` in `.supports_prior_probs`.")
-
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -1007,6 +1001,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
+
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
new file mode 100644
index 00000000000..1c20c622b2c
--- /dev/null
+++ b/spacy/ml/_precomputable_affine.py
@@ -0,0 +1,164 @@
+from thinc.api import Model, normal_init
+
+from ..util import registry
+
+
+@registry.layers("spacy.PrecomputableAffine.v1")
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
+    model = Model(
+        "precomputable_affine",
+        forward,
+        init=init,
+        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+        params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout},
+    )
+    return model
+
+
+def forward(model, X, is_train):
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.get_param("W")
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
+    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
+
+    def backward(dY_ids):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nO, nP), and get back:
+        # (nB, nO, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nO, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        assert dY.ndim == 3
+        assert dY.shape[1] == nO, dY.shape
+        assert dY.shape[2] == nP, dY.shape
+        # nB = dY.shape[0]
+        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+        Xf = X[ids]
+        Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+        model.inc_grad("b", dY.sum(axis=0))
+        dY = dY.reshape((dY.shape[0], nO * nP))
+
+        Wopfi = W.transpose((1, 2, 0, 3))
+        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
+        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+        # (o, p, f, i) --> (f, o, p, i)
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
+        model.inc_grad("W", dWopfi)
+        return dXf.reshape((dXf.shape[0], nF, nI))
+
+    return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+    nB = dY.shape[0]
+    nF = model.get_dim("nF")
+    nP = model.get_dim("nP")
+    nO = model.get_dim("nO")
+    # Backprop the "padding", used as a filler for missing values.
+    # Values that are missing are set to -1, and each state vector could
+    # have multiple missing values. The padding has different values for
+    # different missing features. The gradient of the padding vector is:
+    #
+    # for b in range(nB):
+    #     for f in range(nF):
+    #         if ids[b, f] < 0:
+    #             d_pad[f] += dY[b]
+    #
+    # Which can be rewritten as:
+    #
+    # (ids < 0).T @ dY
+    mask = model.ops.asarray(ids < 0, dtype="f")
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
+    return d_pad.reshape((1, nF, nO, nP))
+
+
+def init(model, X=None, Y=None):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    if model.has_param("W") and model.get_param("W").any():
+        return
+
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nO, nP, nI)
+    b = model.ops.alloc2f(nO, nP)
+    pad = model.ops.alloc4f(1, nF, nO, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
+        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors = vectors.reshape((vectors.shape[0], nO, nP))
+        vectors += b
+        vectors = model.ops.asarray(vectors)
+        if nP >= 2:
+            return model.ops.maxout(vectors)[0]
+        else:
+            return vectors * (vectors >= 0)
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = model.get_param("W").copy()
+    b = model.get_param("b").copy()
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("b", b)
+        else:
+            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 422abf4e260..a70d84dea8f 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,66 +1,23 @@
-import warnings
-from typing import Any, List, Literal, Optional, Tuple
-
-from thinc.api import Model
+from typing import Optional, List, cast
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
-from ...errors import Errors, Warnings
-from ...tokens.doc import Doc
+from ...errors import Errors
+from ...compat import Literal
 from ...util import registry
+from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-
-TransitionSystem = Any  # TODO
-State = Any  # TODO
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v2")
-def transition_parser_v2(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    use_upper: bool,
-    nO: Optional[int] = None,
-) -> Model:
-    if not use_upper:
-        warnings.warn(Warnings.W400)
-
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v3")
-def transition_parser_v3(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-) -> Model:
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
+from ...tokens import Doc
 
 
+@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
+    use_upper: bool,
     nO: Optional[int] = None,
 ) -> Model:
     """
@@ -94,7 +51,14 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3.
+        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
+        is replaced with a ReLu non-linearity if use_upper=True, and no
+        non-linearity if use_upper=False.
+    use_upper (bool): Whether to use an additional hidden layer after the state
+        vector in order to predict the action scores. It is recommended to set
+        this to False for large pretrained models such as transformers, and True
+        for smaller networks. The upper layer is computed on CPU, which becomes
+        a bottleneck on larger GPU-based models, where it's also less necessary.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -105,11 +69,106 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    return TransitionModel(
-        tok2vec=tok2vec,
-        state_tokens=nr_feature_tokens,
-        hidden_width=hidden_width,
-        maxout_pieces=maxout_pieces,
-        nO=nO,
-        unseen_classes=set(),
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec = chain(
+        tok2vec,
+        list2array(),
+        Linear(hidden_width, t2v_width),
     )
+    tok2vec.set_dim("nO", hidden_width)
+    lower = _define_lower(
+        nO=hidden_width if use_upper else nO,
+        nF=nr_feature_tokens,
+        nI=tok2vec.get_dim("nO"),
+        nP=maxout_pieces,
+    )
+    upper = None
+    if use_upper:
+        with use_ops("cpu"):
+            # Initialize weights at zero, as it's a classification layer.
+            upper = _define_upper(nO=nO, nI=None)
+    return TransitionModel(tok2vec, lower, upper, resize_output)
+
+
+def _define_upper(nO, nI):
+    return Linear(nO=nO, nI=nI, init_W=zero_init)
+
+
+def _define_lower(nO, nF, nI, nP):
+    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
+
+
+def resize_output(model, new_nO):
+    if model.attrs["has_upper"]:
+        return _resize_upper(model, new_nO)
+    return _resize_lower(model, new_nO)
+
+
+def _resize_upper(model, new_nO):
+    upper = model.get_ref("upper")
+    if upper.has_dim("nO") is None:
+        upper.set_dim("nO", new_nO)
+        return model
+    elif new_nO == upper.get_dim("nO"):
+        return model
+
+    smaller = upper
+    nI = smaller.maybe_get_dim("nI")
+    with use_ops("cpu"):
+        larger = _define_upper(nO=new_nO, nI=nI)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc2f(new_nO, nI)
+        larger_b = larger.ops.alloc1f(new_nO)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        # Weights are stored in (nr_out, nr_in) format, so we're basically
+        # just adding rows here.
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:old_nO] = smaller_W
+            larger_b[:old_nO] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+    model._layers[-1] = larger
+    model.set_ref("upper", larger)
+    return model
+
+
+def _resize_lower(model, new_nO):
+    lower = model.get_ref("lower")
+    if lower.has_dim("nO") is None:
+        lower.set_dim("nO", new_nO)
+        return model
+
+    smaller = lower
+    nI = smaller.maybe_get_dim("nI")
+    nF = smaller.maybe_get_dim("nF")
+    nP = smaller.maybe_get_dim("nP")
+    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
+        larger_b = larger.ops.alloc2f(new_nO, nP)
+        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        smaller_pad = smaller.get_param("pad")
+        # Copy the old weights and padding into the new layer
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:, 0:old_nO, :, :] = smaller_W
+            larger_pad[:, :, 0:old_nO, :] = smaller_pad
+            larger_b[0:old_nO, :] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+        larger.set_param("pad", larger_pad)
+    model._layers[1] = larger
+    model.set_ref("lower", larger)
+    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
new file mode 100644
index 00000000000..8def6cea53f
--- /dev/null
+++ b/spacy/ml/parser_model.pxd
@@ -0,0 +1,49 @@
+from libc.string cimport memset, memcpy
+from thinc.backends.cblas cimport CBlas
+from ..typedefs cimport weight_t, hash_t
+from ..pipeline._parser_internals._state cimport StateC
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const float* seen_classes
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* scores
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
+
+
+cdef WeightsC get_c_weights(model) except *
+
+cdef SizesC get_c_sizes(model, int batch_size) except *
+
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil
+ 
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores, int O) nogil
+ 
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
new file mode 100644
index 00000000000..91558683b60
--- /dev/null
+++ b/spacy/ml/parser_model.pyx
@@ -0,0 +1,500 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+cimport numpy as np
+from libc.math cimport exp
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from thinc.backends.cblas cimport saxpy, sgemm
+
+import numpy
+import numpy.random
+from thinc.api import Model, CupyOps, NumpyOps, get_ops
+
+from .. import util
+from ..errors import Errors
+from ..typedefs cimport weight_t, class_t, hash_t
+from ..pipeline._parser_internals.stateclass cimport StateClass
+
+
+cdef WeightsC get_c_weights(model) except *:
+    cdef WeightsC output
+    cdef precompute_hiddens state2vec = model.state2vec
+    output.feat_weights = state2vec.get_feat_weights()
+    output.feat_bias = <const float*>state2vec.bias.data
+    cdef np.ndarray vec2scores_W
+    cdef np.ndarray vec2scores_b
+    if model.vec2scores is None:
+        output.hidden_weights = NULL
+        output.hidden_bias = NULL
+    else:
+        vec2scores_W = model.vec2scores.get_param("W")
+        vec2scores_b = model.vec2scores.get_param("b")
+        output.hidden_weights = <const float*>vec2scores_W.data
+        output.hidden_bias = <const float*>vec2scores_b.data
+    cdef np.ndarray class_mask = model._class_mask
+    output.seen_classes = <const float*>class_mask.data
+    return output
+
+
+cdef SizesC get_c_sizes(model, int batch_size) except *:
+    cdef SizesC output
+    output.states = batch_size
+    if model.vec2scores is None:
+        output.classes = model.state2vec.get_dim("nO")
+    else:
+        output.classes = model.vec2scores.get_dim("nO")
+    output.hiddens = model.state2vec.get_dim("nO")
+    output.pieces = model.state2vec.get_dim("nP")
+    output.feats = model.state2vec.get_dim("nF")
+    output.embed_width = model.tokvecs.shape[1]
+    return output
+
+
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.scores = <float*>realloc(A.scores,
+            n.states * n.classes * sizeof(A.scores[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil:
+    cdef double one = 1.0
+    resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
+    sum_state_features(cblas, A.unmaxed,
+        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = _arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    memset(A.scores, 0, n.states * n.classes * sizeof(float))
+    if W.hidden_weights == NULL:
+        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+            1.0, <const float *>A.hiddens, n.hiddens,
+            <const float *>W.hidden_weights, n.hiddens,
+            0.0, A.scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = A.scores[0]
+    for i in range(1, n.states * n.classes):
+        if A.scores[i] < min_:
+            min_ = A.scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if not W.seen_classes[j]:
+                A.scores[i*n.classes+j] = min_
+
+
+cdef void sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    padding = cached
+    cached += F * O
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores,
+        int O) nogil:
+    """Do multi-label log loss"""
+    cdef double max_, gmax, Z, gZ
+    best = arg_max_if_gold(scores, costs, is_valid, O)
+    guess = _arg_max(scores, O)
+
+    if best == -1 or guess == -1:
+        # These shouldn't happen, but if they do, we want to make sure we don't
+        # cause an OOB access.
+        return
+    Z = 1e-10
+    gZ = 1e-10
+    max_ = scores[guess]
+    gmax = scores[best]
+    for i in range(O):
+        Z += exp(scores[i] - max_)
+        if costs[i] <= costs[best]:
+            gZ += exp(scores[i] - gmax)
+    for i in range(O):
+        if costs[i] <= costs[best]:
+            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
+        else:
+            d_scores[i] = exp(scores[i]-max_) / Z
+
+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
+        const int* is_valid, int n) nogil:
+    # Find minimum cost
+    cdef float cost = 1
+    for i in range(n):
+        if is_valid[i] and costs[i] < cost:
+            cost = costs[i]
+    # Now find best-scoring with that cost
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= cost and is_valid[i]:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+
+class ParserStepModel(Model):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+            dropout=0.1):
+        Model.__init__(self, name="parser_step_model", forward=step_forward)
+        self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
+        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
+        if layers[1].get_dim("nP") >= 2:
+            activation = "maxout"
+        elif has_upper:
+            activation = None
+        else:
+            activation = "relu"
+        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+                                            activation=activation, train=train)
+        if has_upper:
+            self.vec2scores = layers[-1]
+        else:
+            self.vec2scores = None
+        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
+        self.backprops = []
+        self._class_mask = numpy.zeros((self.nO,), dtype='f')
+        self._class_mask.fill(1)
+        if unseen_classes is not None:
+            for class_ in unseen_classes:
+                self._class_mask[class_] = 0.
+
+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
+    @property
+    def nO(self):
+        if self.attrs["has_upper"]:
+            return self.vec2scores.get_dim("nO")
+        else:
+            return self.state2vec.get_dim("nO")
+
+    def class_is_unseen(self, class_):
+        return self._class_mask[class_]
+
+    def mark_class_unseen(self, class_):
+        self._class_mask[class_] = 0
+
+    def mark_class_seen(self, class_):
+        self._class_mask[class_] = 1
+
+    def get_token_ids(self, states):
+        cdef StateClass state
+        states = [state for state in states if not state.is_final()]
+        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
+                                          dtype='i', order='C')
+        ids.fill(-1)
+        c_ids = <int*>ids.data
+        for state in states:
+            state.c.set_context_tokens(c_ids, ids.shape[1])
+            c_ids += ids.shape[1]
+        return ids
+
+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
+    def finish_steps(self, golds):
+        # Add a padding vector to the d_tokvecs gradient, so that missing
+        # values don't affect the real gradient.
+        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
+        for ids, d_vector, bp_vector in self.backprops:
+            d_state_features = bp_vector((d_vector, ids))
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.ops.scatter_add(d_tokvecs, ids,
+                d_state_features)
+        # Padded -- see update()
+        self.bp_tokvecs(d_tokvecs[:-1])
+        return d_tokvecs
+
+NUMPY_OPS = NumpyOps()
+
+def step_forward(model: ParserStepModel, states, is_train):
+    token_ids = model.get_token_ids(states)
+    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
+    if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            vector *= mask
+        scores, get_d_vector = model.vec2scores(vector, is_train)
+    else:
+        scores = NumpyOps().asarray(vector)
+        get_d_vector = lambda d_scores: d_scores
+    # If the class is unseen, make sure its score is minimum
+    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
+
+    def backprop_parser_step(d_scores):
+        # Zero vectors for unseen classes
+        d_scores *= model._class_mask
+        d_vector = get_d_vector(d_scores)
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
+        return None
+    return scores, backprop_parser_step
+
+
+cdef class precompute_hiddens:
+    """Allow a model to be "primed" by pre-computing input features in bulk.
+
+    This is used for the parser, where we want to take a batch of documents,
+    and compute vectors for each (token, position) pair. These vectors can then
+    be reused, especially for beam-search.
+
+    Let's say we're using 12 features for each state, e.g. word at start of
+    buffer, three words on stack, their children, etc. In the normal arc-eager
+    system, a document of length N is processed in 2*N states. This means we'll
+    create 2*N*12 feature vectors --- but if we pre-compute, we only need
+    N*12 vector computations. The saving for beam-search is much better:
+    if we have a beam of k, we'll normally make 2*N*12*K computations --
+    so we can save the factor k. This also gives a nice CPU/GPU division:
+    we can do all our hard maths up front, packed into large multiplications,
+    and do the hard-to-program parsing on the CPU.
+    """
+    cdef readonly int nF, nO, nP
+    cdef bint _is_synchronized
+    cdef public object ops
+    cdef public object numpy_ops
+    cdef public object _cpu_ops
+    cdef np.ndarray _features
+    cdef np.ndarray _cached
+    cdef np.ndarray bias
+    cdef object _cuda_stream
+    cdef object _bp_hiddens
+    cdef object activation
+
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 activation="maxout", train=False):
+        gpu_cached, bp_features = lower_model(tokvecs, train)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
+            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
+        else:
+            self.bias = lower_model.get_param("b")
+        self.nF = cached.shape[1]
+        if lower_model.has_dim("nP"):
+            self.nP = lower_model.get_dim("nP")
+        else:
+            self.nP = 1
+        self.nO = cached.shape[2]
+        self.ops = lower_model.ops
+        self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
+        assert activation in (None, "relu", "maxout")
+        self.activation = activation
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    cdef const float* get_feat_weights(self) except NULL:
+        if not self._is_synchronized and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        return <float*>self._cached.data
+
+    def has_dim(self, name):
+        if name == "nF":
+            return self.nF if self.nF is not None else True
+        elif name == "nP":
+            return self.nP if self.nP is not None else True
+        elif name == "nO":
+            return self.nO if self.nO is not None else True
+        else:
+            return False
+
+    def get_dim(self, name):
+        if name == "nF":
+            return self.nF
+        elif name == "nP":
+            return self.nP
+        elif name == "nO":
+            return self.nO
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def set_dim(self, name, value):
+        if name == "nF":
+            self.nF = value
+        elif name == "nP":
+            self.nP = value
+        elif name == "nO":
+            self.nO = value
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def __call__(self, X, bint is_train):
+        if is_train:
+            return self.begin_update(X)
+        else:
+            return self.predict(X), lambda X: X
+
+    def predict(self, X):
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids):
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        # This is tricky, but (assuming GPU available);
+        # - Input to forward on CPU
+        # - Output from forward on CPU
+        # - Input to backward on GPU!
+        # - Output from backward on GPU
+        bp_hiddens = self._bp_hiddens
+
+        cdef CBlas cblas = self._cpu_ops.cblas()
+
+        feat_weights = self.get_feat_weights()
+        cdef int[:, ::1] ids = token_ids
+        sum_state_features(cblas, <float*>state_vector.data,
+            feat_weights, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
+
+        def backward(d_state_vector_ids):
+            d_state_vector, token_ids = d_state_vector_ids
+            d_state_vector = bp_nonlinearity(d_state_vector)
+            d_tokens = bp_hiddens((d_state_vector, token_ids))
+            return d_tokens
+        return state_vector, backward
+
+    def _nonlinearity(self, state_vector):
+        if self.activation == "maxout":
+            return self._maxout_nonlinearity(state_vector)
+        else:
+            return self._relu_nonlinearity(state_vector)
+
+    def _maxout_nonlinearity(self, state_vector):
+        state_vector, mask = self.numpy_ops.maxout(state_vector)
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_maxout(d_best):
+            return self.ops.backprop_maxout(d_best, mask, self.nP)
+        
+        return state_vector, backprop_maxout
+
+    def _relu_nonlinearity(self, state_vector):
+        state_vector = state_vector.reshape((state_vector.shape[0], -1))
+        mask = state_vector >= 0.
+        state_vector *= mask
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_relu(d_best):
+            d_best *= mask
+            return d_best.reshape((d_best.shape + (1,)))
+ 
+        return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
deleted file mode 100644
index 965508519e8..00000000000
--- a/spacy/ml/tb_framework.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from libc.stdint cimport int8_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-    int tokens
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const int8_t* seen_mask
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 00000000000..ab4a969e24e
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,50 @@
+from thinc.api import Model, noop
+from .parser_model import ParserStepModel
+from ..util import registry
+
+
+@registry.layers("spacy.TransitionModel.v1")
+def TransitionModel(
+    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
+):
+    """Set up a stepwise transition-based model"""
+    if upper is None:
+        has_upper = False
+        upper = noop()
+    else:
+        has_upper = True
+    # don't define nO for this object, because we can't dynamically change it
+    return Model(
+        name="parser_model",
+        forward=forward,
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
+        layers=[tok2vec, lower, upper],
+        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+        init=init,
+        attrs={
+            "has_upper": has_upper,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def forward(model, X, is_train):
+    step_model = ParserStepModel(
+        X,
+        model.layers,
+        unseen_classes=model.attrs["unseen_classes"],
+        train=is_train,
+        has_upper=model.attrs["has_upper"],
+    )
+
+    return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+    model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower")
+    lower.initialize()
+    if model.attrs["has_upper"]:
+        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
deleted file mode 100644
index e497643f0cd..00000000000
--- a/spacy/ml/tb_framework.pyx
+++ /dev/null
@@ -1,639 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, cast
-
-from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
-import numpy
-
-cimport numpy as np
-
-from thinc.api import (
-    Linear,
-    Model,
-    NumpyOps,
-    chain,
-    glorot_uniform_init,
-    list2array,
-    normal_init,
-    uniform_init,
-    zero_init,
-)
-
-from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-
-from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
-
-from ..errors import Errors
-from ..pipeline._parser_internals import _beam_utils
-from ..pipeline._parser_internals.batch import GreedyBatch
-
-from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
-from ..pipeline._parser_internals.transition_system cimport (
-    TransitionSystem,
-    c_apply_actions,
-    c_transition_batch,
-)
-
-from ..tokens.doc import Doc
-from ..util import registry
-
-State = Any  # TODO
-
-
-@registry.layers("spacy.TransitionModel.v2")
-def TransitionModel(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    beam_width: int = 1,
-    beam_density: float = 0.0,
-    state_tokens: int,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-    unseen_classes=set(),
-) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
-    """Set up a transition-based parsing model, using a maxout hidden
-    layer and a linear output layer.
-    """
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
-    tok2vec_projected.set_dim("nO", hidden_width)
-
-    # FIXME: we use `output` as a container for the output layer's
-    # weights and biases. Thinc optimizers cannot handle resizing
-    # of parameters. So, when the parser model is resized, we
-    # construct a new `output` layer, which has a different key in
-    # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `output` layer by `output_W` and `output_b`
-    # parameters in this model.
-    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
-
-    return Model(
-        name="parser_model",
-        forward=forward,
-        init=init,
-        layers=[tok2vec_projected, output],
-        refs={
-            "tok2vec": tok2vec_projected,
-            "output": output,
-        },
-        params={
-            "hidden_W": None,  # Floats2d W for the hidden layer
-            "hidden_b": None,  # Floats1d bias for the hidden layer
-            "hidden_pad": None,  # Floats1d padding for the hidden layer
-        },
-        dims={
-            "nO": None,  # Output size
-            "nP": maxout_pieces,
-            "nH": hidden_width,
-            "nI": tok2vec_projected.maybe_get_dim("nO"),
-            "nF": state_tokens,
-        },
-        attrs={
-            "beam_width": beam_width,
-            "beam_density": beam_density,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def resize_output(model: Model, new_nO: int) -> Model:
-    old_nO = model.maybe_get_dim("nO")
-    output = model.get_ref("output")
-    if old_nO is None:
-        model.set_dim("nO", new_nO)
-        output.set_dim("nO", new_nO)
-        output.initialize()
-        return model
-    elif new_nO <= old_nO:
-        return model
-    elif output.has_param("W"):
-        nH = model.get_dim("nH")
-        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_output.initialize()
-        new_W = new_output.get_param("W")
-        new_b = new_output.get_param("b")
-        old_W = output.get_param("W")
-        old_b = output.get_param("b")
-        new_W[:old_nO] = old_W  # type: ignore
-        new_b[:old_nO] = old_b  # type: ignore
-        for i in range(old_nO, new_nO):
-            model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_output
-        model.set_ref("output", new_output)
-    # TODO: Avoid this private intrusion
-    model._dims["nO"] = new_nO
-    return model
-
-
-def init(
-    model,
-    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
-    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
-):
-    if X is not None:
-        docs, _ = X
-        model.get_ref("tok2vec").initialize(X=docs)
-    else:
-        model.get_ref("tok2vec").initialize()
-    inferred_nO = _infer_nO(Y)
-    if inferred_nO is not None:
-        current_nO = model.maybe_get_dim("nO")
-        if current_nO is None or current_nO != inferred_nO:
-            model.attrs["resize_output"](model, inferred_nO)
-    nP = model.get_dim("nP")
-    nH = model.get_dim("nH")
-    nI = model.get_dim("nI")
-    nF = model.get_dim("nF")
-    ops = model.ops
-
-    Wl = ops.alloc2f(nH * nP, nF * nI)
-    bl = ops.alloc1f(nH * nP)
-    padl = ops.alloc1f(nI)
-    # Wl = zero_init(ops, Wl.shape)
-    Wl = glorot_uniform_init(ops, Wl.shape)
-    padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize output_W
-    model.set_param("hidden_W", Wl)
-    model.set_param("hidden_b", bl)
-    model.set_param("hidden_pad", padl)
-    # model = _lsuv_init(model)
-    return model
-
-
-class TransitionModelInputs:
-    """
-    Input to transition model.
-    """
-
-    # dataclass annotation is not yet supported in Cython 0.29.x,
-    # so, we'll do something close to it.
-
-    actions: Optional[List[Ints1d]]
-    docs: List[Doc]
-    max_moves: int
-    moves: TransitionSystem
-    states: Optional[List[State]]
-
-    __slots__ = [
-        "actions",
-        "docs",
-        "max_moves",
-        "moves",
-        "states",
-    ]
-
-    def __init__(
-        self,
-        docs: List[Doc],
-        moves: TransitionSystem,
-        actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0,
-        states: Optional[List[State]] = None,
-    ):
-        """
-        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
-        docs (List[Doc]): Docs to predict transition sequences for.
-        max_moves: (int): the maximum number of moves to apply, values less
-            than 1 will apply moves to states until they are final states.
-        moves (TransitionSystem): the transition system to use when predicting
-            the transition sequences.
-        states (Optional[List[States]]): the initial states to predict the
-            transition sequences for. When absent, the initial states are
-            initialized from the provided Docs.
-        """
-        self.actions = actions
-        self.docs = docs
-        self.moves = moves
-        self.max_moves = max_moves
-        self.states = states
-
-
-def forward(model, inputs: TransitionModelInputs, is_train: bool):
-    docs = inputs.docs
-    moves = inputs.moves
-    actions = inputs.actions
-
-    beam_width = model.attrs["beam_width"]
-    hidden_pad = model.get_param("hidden_pad")
-    tok2vec = model.get_ref("tok2vec")
-
-    states = moves.init_batch(docs) if inputs.states is None else inputs.states
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    seen_mask = _get_seen_mask(model)
-
-    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
-        # Note: max_moves is only used during training, so we don't need to
-        #       pass it to the greedy inference path.
-        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
-    else:
-        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
-                                 max_moves=inputs.max_moves)
-
-
-def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
-    cdef vector[StateC*] c_states
-    cdef StateClass state
-    for state in states:
-        if not state.is_final():
-            c_states.push_back(state.c)
-    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
-    # Precomputed features have rows for each token, plus one for padding.
-    cdef int n_tokens = feats.shape[0] - 1
-    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
-    cdef CBlas cblas = model.ops.cblas()
-    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
-
-    def backprop(dY):
-        raise ValueError(Errors.E4004)
-
-    return (states, scores), backprop
-
-
-cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
-                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i
-    cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = _alloc_activations(sizes)
-    cdef np.ndarray step_scores
-    cdef np.ndarray step_actions
-
-    scores = []
-    while sizes.states >= 1:
-        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
-        step_actions = actions[0] if actions is not None else None
-        with nogil:
-            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
-            if actions is None:
-                # Validate actions, argmax, take action.
-                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                                   sizes.states)
-            else:
-                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-        sizes.states = unfinished.size()
-        scores.append(step_scores)
-        unfinished.clear()
-        actions = actions[1:] if actions is not None else None
-    _free_activations(&activations)
-
-    return scores
-
-
-def _forward_fallback(
-    model: Model,
-    moves: TransitionSystem,
-    states: List[StateClass],
-    tokvecs, backprop_tok2vec,
-    feats,
-    backprop_feats,
-    seen_mask,
-    is_train: bool,
-    actions: Optional[List[Ints1d]] = None,
-    max_moves: int = 0,
-):
-    nF = model.get_dim("nF")
-    output = model.get_ref("output")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-
-    beam_width = model.attrs["beam_width"]
-    beam_density = model.attrs["beam_density"]
-
-    ops = model.ops
-
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    if beam_width == 1:
-        batch = GreedyBatch(moves, states, None)
-    else:
-        batch = _beam_utils.BeamBatch(
-            moves, states, None, width=beam_width, density=beam_density
-        )
-    arange = ops.xp.arange(nF)
-    n_moves = 0
-    while not batch.is_done:
-        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
-        for i, state in enumerate(batch.get_unfinished_states()):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += hidden_b
-        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        cpu_scores = ops.to_numpy(scores)
-        if actions is None:
-            batch.advance(cpu_scores)
-        else:
-            batch.advance_with_actions(actions[0])
-            actions = actions[1:]
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_ids.append(ids)
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-        if n_moves >= max_moves >= 1:
-            break
-        n_moves += 1
-
-    def backprop_parser(d_states_d_scores):
-        ids = ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = ops.xp.vstack(all_statevecs)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False  # no-cython-lint
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO) @ (nS, nH).T
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        # We don't need to backprop the summation, because we pass back the IDs instead
-        d_state_features = backprop_feats((d_preacts2f, ids))
-        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (list(batch), all_scores), backprop_parser
-
-
-def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
-    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
-    for class_ in model.attrs.get("unseen_classes", set()):
-        mask[class_] = True
-    return mask
-
-
-def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("hidden_W")
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
-    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
-    W3f = W3f.transpose((1, 0, 2))
-    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
-    assert X.shape == (X.shape[0], nI), X.shape
-    Yf_ = model.ops.gemm(X, W2f, trans2=True)
-    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
-
-    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nH, nP), and get back:
-        # (nB, nH, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nH, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        dXf = model.ops.gemm(dY, W)
-        Xf = X[ids].reshape((ids.shape[0], -1))
-        dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("hidden_W", dW)
-        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
-
-    return Yf, backward
-
-
-def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
-    if Y is None:
-        return None
-    _, scores = Y
-    if len(scores) == 0:
-        return None
-    assert scores[0].shape[0] >= 1
-    assert len(scores[0].shape) == 2
-    return scores[0].shape[1]
-
-
-def _lsuv_init(model: Model):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    W = model.maybe_get_param("hidden_W")
-    if W is not None and W.any():
-        return
-
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nH, nP, nI)
-    b = model.ops.alloc2f(nH, nP)
-    pad = model.ops.alloc4f(1, nF, nH, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc_f((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc_f((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
-        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
-        vectors3f += b
-        return model.ops.maxout(vectors3f)[0]
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = cast(Floats4d, model.get_param("hidden_W").copy())
-    b = cast(Floats2d, model.get_param("hidden_b").copy())
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("hidden_W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("hidden_b", b)
-        else:
-            break
-    return model
-
-
-cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    output = model.get_ref("output")
-    cdef np.ndarray hidden_b = model.get_param("hidden_b")
-    cdef np.ndarray output_W = output.get_param("W")
-    cdef np.ndarray output_b = output.get_param("b")
-
-    cdef WeightsC weights
-    weights.feat_weights = feats
-    weights.feat_bias = <const float*>hidden_b.data
-    weights.hidden_weights = <const float *> output_W.data
-    weights.hidden_bias = <const float *> output_b.data
-    weights.seen_mask = <const int8_t*> seen_mask.data
-
-    return weights
-
-
-cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC sizes
-    sizes.states = batch_size
-    sizes.classes = model.get_dim("nO")
-    sizes.hiddens = model.get_dim("nH")
-    sizes.pieces = model.get_dim("nP")
-    sizes.feats = model.get_dim("nF")
-    sizes.embed_width = model.get_dim("nI")
-    sizes.tokens = tokens
-    return sizes
-
-
-cdef ActivationsC _alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    _resize_activations(&A, n)
-    return A
-
-
-cdef void _free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    _resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    if W.hidden_weights == NULL:
-        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = scores[0]
-    for i in range(1, n.states * n.classes):
-        if scores[i] < min_:
-            min_ = scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if W.seen_mask[j]:
-                scores[i*n.classes+j] = min_
-
-
-cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces  # no-cython-lint
-    cdef int F = n.feats
-    cdef int T = n.tokens
-    padding = cached + (T * F * O)
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 7c546752d80..273cc6c1078 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -8,8 +8,6 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
-
-from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 
 from .search import MaxViolation
@@ -31,7 +29,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(Batch):
+cdef class BeamBatch(object):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
deleted file mode 100644
index 7fee05bad60..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef int arg_max(const float* scores, const int n_classes) nogil
-cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
deleted file mode 100644
index 582756bf5be..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pyx
+++ /dev/null
@@ -1,22 +0,0 @@
-# cython: infer_types=True
-
-cdef inline int arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
-
-
-cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1c61ac271d8..04274ce8af1 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -7,6 +7,8 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -26,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    vector[int] _heads
+    int* _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,34 +36,31 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
-    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil except +:
-        this._heads.resize(length, -1)
-        this._unshiftable.resize(length, False)
-
-        # Reserve memory ahead of time to minimize allocations during parsing.
-        # The initial capacity set here ideally reflects the expected average-case/majority usage.
-        cdef int init_capacity = 32
-        this._stack.reserve(init_capacity)
-        this._rebuffer.reserve(init_capacity)
-        this._ents.reserve(init_capacity)
-        this._left_arcs.reserve(init_capacity)
-        this._right_arcs.reserve(init_capacity)
-        this.history.reserve(init_capacity)
-
+    __init__(const TokenC* sent, int length) nogil:
         this._sent = sent
+        this._heads = <int*>calloc(length, sizeof(int))
+        if not (this._sent and this._heads):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
+        for i in range(length):
+            this._heads[i] = -1
+            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
+    __dealloc__():
+        free(this._heads)
+
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -134,20 +133,19 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        cdef int stack_size = this._stack.size()
-        if i >= stack_size or i < 0:
+        if i >= this._stack.size():
             return -1
-        else:
-            return this._stack[stack_size - (i+1)]
+        elif i < 0:
+            return -1
+        return this._stack.at(this._stack.size() - (i+1))
 
     int B(int i) nogil const:
-        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < buf_size:
-            return this._rebuffer[buf_size - (i+1)]
+        elif i < this._rebuffer.size():
+            return this._rebuffer.at(this._rebuffer.size() - (i+1))
         else:
-            b_i = this._b_i + (i - buf_size)
+            b_i = this._b_i + (i - this._rebuffer.size())
             if b_i >= this.length:
                 return -1
             else:
@@ -246,7 +244,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
+        elif this._sent_starts.count(word) >= 1:
             return 1
         else:
             return 0
@@ -330,7 +328,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable[item]
+            return this._unshiftable.at(item)
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -350,9 +348,6 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        cdef ArcC* arc
-
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -361,12 +356,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = &arcs.back()
+        arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = &deref(arcs)[i]
+                arc = arcs.at(i)
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -406,11 +401,10 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        this._heads = src._heads
+        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
-        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 08f60b2634b..6ffceae10d3 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,8 +779,6 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -865,7 +863,6 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
deleted file mode 100644
index 60734e549aa..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef class Batch:
-    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
deleted file mode 100644
index 91073b52e68..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any
-
-TransitionSystem = Any  # TODO
-
-cdef class Batch:
-    def advance(self, scores):
-        raise NotImplementedError
-
-    def get_states(self):
-        raise NotImplementedError
-
-    @property
-    def is_done(self):
-        raise NotImplementedError
-
-    def get_unfinished_states(self):
-        raise NotImplementedError
-
-    def __getitem__(self, i):
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-
-class GreedyBatch(Batch):
-    def __init__(self, moves: TransitionSystem, states, golds):
-        self._moves = moves
-        self._states = states
-        self._next_states = [s for s in states if not s.is_final()]
-
-    def advance(self, scores):
-        self._next_states = self._moves.transition_states(self._next_states, scores)
-
-    def advance_with_actions(self, actions):
-        self._next_states = self._moves.apply_actions(self._next_states, actions)
-
-    def get_states(self):
-        return self._states
-
-    @property
-    def is_done(self):
-        return all(s.is_final() for s in self._states)
-
-    def get_unfinished_states(self):
-        return [st for st in self._states if not st.is_final()]
-
-    def __getitem__(self, i):
-        return self._states[i]
-
-    def __len__(self):
-        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 3a352f51ff5..0b9980ddbf2 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -159,7 +159,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+    
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -309,8 +309,6 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -646,7 +644,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
-
+ 
 
 
 cdef class Out:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index c2a0d22956a..f25408a13ba 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,10 +21,6 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
-    @property
-    def history(self):
-        return list(self.c.history)
-
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -181,6 +177,3 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
-
-    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
-        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 08baed932ba..04cd10d8864 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,10 +57,3 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 50b155bf9bb..485ce7c10bd 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,18 +3,12 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
 
 from collections import Counter
 
 import srsly
 
 from ...structs cimport TokenC
-from ...tokens.doc cimport Doc
-from ...typedefs cimport attr_t, weight_t
-from . cimport _beam_utils
-from ._parser_utils cimport arg_max_if_valid
 from .stateclass cimport StateClass
 
 from ... import util
@@ -79,18 +73,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def follow_history(self, doc, history):
-        cdef int clas
-        cdef StateClass state = StateClass(doc)
-        for clas in history:
-            action = self.c[clas]
-            action.do(state.c, action.label)
-            state.c.history.push_back(clas)
-        return state
-
     def get_oracle_sequence(self, Example example, _debug=False):
-        if not self.has_gold(example):
-            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -102,8 +85,6 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
-        if state.is_final():
-            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -129,7 +110,6 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -157,28 +137,6 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
-        state.c.history.push_back(action.clas)
-
-    def apply_actions(self, states, const int[::1] actions):
-        assert len(states) == actions.shape[0]
-        cdef StateClass state
-        cdef vector[StateC*] c_states
-        c_states.resize(len(states))
-        cdef int i
-        for (i, state) in enumerate(states):
-            c_states[i] = state.c
-        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    def transition_states(self, states, float[:, ::1] scores):
-        assert len(states) == scores.shape[0]
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -291,34 +249,3 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil:
-    cdef int i
-    cdef Transition action
-    cdef StateC* state
-    for i in range(batch_size):
-        state = states[i]
-        action = moves.c[actions[i]]
-        action.do(state, action.label)
-        state.history.push_back(action.clas)
-
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil:
-    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
-    cdef int i, guess
-    cdef Transition action
-    for i in range(batch_size):
-        moves.set_valid(is_valid, states[i])
-        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-        if guess == -1:
-            # This shouldn't happen, but it's hard to raise an error here,
-            # and we don't want to infinite loop. So, force to end state.
-            states[i].force_final()
-        else:
-            action = moves.c[guess]
-            action.do(states[i], action.label)
-            states[i].history.push_back(guess)
-    free(is_valid)
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.pyx
similarity index 96%
rename from spacy/pipeline/dep_parser.py
rename to spacy/pipeline/dep_parser.pyx
index b4961487b83..3e59deaae4f 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.pyx
@@ -4,6 +4,10 @@
 
 from thinc.api import Config, Model
 
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager
+
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
@@ -17,11 +21,12 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -121,7 +126,6 @@ def make_parser(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,7 +231,6 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
-
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,11 +238,8 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
-
     results = {}
-    results.update(
-        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-    )
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -252,12 +252,11 @@ def make_parser_scorer():
     return parser_score
 
 
-class DependencyParser(Parser):
+cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
-
     TransitionSystem = ArcEager
 
     def __init__(
@@ -277,7 +276,8 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser."""
+        """Create a DependencyParser.
+        """
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.pyx
similarity index 93%
rename from spacy/pipeline/ner.py
rename to spacy/pipeline/ner.pyx
index 1c7cf151385..c73ec5c528a 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.pyx
@@ -10,15 +10,22 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..util import registry
+from ..training import remove_bilu_prefix
+
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -43,12 +50,8 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+
 )
 def make_ner(
     nlp: Language,
@@ -101,7 +104,6 @@ def make_ner(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -115,12 +117,7 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
 def make_beam_ner(
     nlp: Language,
@@ -194,12 +191,11 @@ def make_ner_scorer():
     return ner_score
 
 
-class EntityRecognizer(Parser):
+cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -217,14 +213,15 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer."""
+        """Create an EntityRecognizer.
+        """
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,  # not relevant for NER
+            min_action_freq=1,   # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
new file mode 100644
index 00000000000..f20e69a6e56
--- /dev/null
+++ b/spacy/pipeline/transition_parser.pxd
@@ -0,0 +1,21 @@
+from cymem.cymem cimport Pool
+from thinc.backends.cblas cimport CBlas
+
+from ..vocab cimport Vocab
+from .trainable_pipe cimport TrainablePipe
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals._state cimport StateC
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+
+
+cdef class Parser(TrainablePipe):
+    cdef public object _rehearsal_model
+    cdef readonly TransitionSystem moves
+    cdef public object _multitasks
+    cdef object _cpu_ops
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 99970b3fe93..4290420c788 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,16 +1,21 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
 from typing import Dict, Iterable, List, Optional, Tuple
-
-cimport numpy as np
 from cymem.cymem cimport Pool
-
-import contextlib
-import random
+cimport numpy as np
 from itertools import islice
+from libcpp.vector cimport vector
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free
+import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
+import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -24,7 +29,16 @@ from thinc.api import (
 )
 from thinc.types import Floats2d, Ints1d
 
-from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..tokens.doc cimport Doc
+from .trainable_pipe import TrainablePipe
+from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
 from ..typedefs cimport weight_t
@@ -52,7 +66,7 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 NUMPY_OPS = NumpyOps()
 
 
-class Parser(TrainablePipe):
+cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -152,9 +166,8 @@ class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
-        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -261,6 +274,15 @@ class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -268,38 +290,50 @@ class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        # We distill as follows: 1. we first let the student predict transition
-        # sequences (and the corresponding transition probabilities); (2) we
-        # let the teacher follow the student's predicted transition sequences
-        # to obtain the teacher's transition probabilities; (3) we compute the
-        # gradients of the student's transition distributions relative to the
-        # teacher's distributions.
-
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
-        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
 
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-        backprop_scores((student_states, d_scores))
+        backprop_tok2vec(student_docs)
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
         return losses
 
     def get_teacher_student_loss(
-            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-            normalize: bool = False,
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -311,28 +345,10 @@ class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-
-        # We can't easily hook up a softmax layer in the parsing model, since
-        # the get_loss does additional masking. So, we could apply softmax
-        # manually here and use Thinc's cross-entropy loss. But it's a bit
-        # suboptimal, since we can have a lot of states that would result in
-        # many kernel launches. Futhermore the parsing model's backprop expects
-        # a XP array, so we'd have to concat the softmaxes anyway. So, like
-        # the get_loss implementation, we'll compute the loss and gradients
-        # ourselves.
-
-        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-                                                axis=-1, inplace=True)
-        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-                                                axis=-1, inplace=True)
-
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = student_scores - teacher_scores
-        if normalize:
-            d_scores /= d_scores.shape[0]
-        loss = (d_scores**2).sum() / d_scores.size
-
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -355,6 +371,9 @@ class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
+        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
+            deals with a failing batch of documents. The default function just reraises
+            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -375,29 +394,78 @@ class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
-        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states_or_beams, _ = self.model.predict(inputs)
-        return states_or_beams
+        if self.cfg["beam_width"] == 1:
+            return self.greedy_parse(docs, drop=0.0)
+        else:
+            return self.beam_parse(
+                docs,
+                drop=0.0,
+                beam_width=self.cfg["beam_width"],
+                beam_density=self.cfg["beam_density"]
+            )
 
     def greedy_parse(self, docs, drop=0.):
-        self._resize()
+        cdef vector[StateC*] states
+        cdef StateClass state
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=1):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states, _ = self.model.predict(inputs)
-        return states
+        set_dropout_rate(self.model, drop)
+        batch = self.moves.init_batch(docs)
+        model = self.model.predict(docs)
+        weights = get_c_weights(model)
+        for state in batch:
+            if not state.is_final():
+                states.push_back(state.c)
+        sizes = get_c_sizes(model, states.size())
+        with nogil:
+            self._parseC(cblas, &states[0], weights, sizes)
+        model.clear_memory()
+        del model
+        return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
+        cdef Beam beam
+        cdef Doc doc
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            beams, _ = self.model.predict(inputs)
-        return beams
+        batch = _beam_utils.BeamBatch(
+            self.moves,
+            self.moves.init_batch(docs),
+            None,
+            beam_width,
+            density=beam_density
+        )
+        model = self.model.predict(docs)
+        while not batch.is_done:
+            states = batch.get_unfinished_states()
+            if not states:
+                break
+            scores = model.predict(states)
+            batch.advance(scores)
+        model.clear_memory()
+        del model
+        return list(batch)
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil:
+        cdef int i, j
+        cdef vector[StateC*] unfinished
+        cdef ActivationsC activations = alloc_activations(sizes)
+        while sizes.states >= 1:
+            predict_states(cblas, &activations, states, &weights, sizes)
+            # Validate actions, argmax, take action.
+            self.c_transition_batch(states,
+                activations.scores, sizes.classes, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+            sizes.states = unfinished.size()
+            unfinished.clear()
+        free_activations(&activations)
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -408,6 +476,35 @@ class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
+    def transition_states(self, states, float[:, ::1] scores):
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
+        cdef int i, guess
+        cdef Transition action
+        for i in range(batch_size):
+            self.moves.set_valid(is_valid, states[i])
+            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+            if guess == -1:
+                # This shouldn't happen, but it's hard to raise an error here,
+                # and we don't want to infinite loop. So, force to end state.
+                states[i].force_final()
+            else:
+                action = self.moves.c[guess]
+                action.do(states[i], action.label)
+        free(is_valid)
+
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -418,99 +515,67 @@ class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
-        # We need to take care to act on the whole batch, because we might be
-        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        docs = [eg.x for eg in examples if len(eg.x)]
-
+        # The probability we use beam update, instead of falling back to
+        # a greedy update
+        beam_update_prob = self.cfg["beam_update_prob"]
+        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
+            return self.update_beam(
+                examples,
+                beam_width=self.cfg["beam_width"],
+                sgd=sgd,
+                losses=losses,
+                beam_density=self.cfg["beam_density"]
+            )
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            init_states, gold_states, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states, golds, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
-
-        inputs = TransitionModelInputs(docs=docs,
-                                       moves=self.moves,
-                                       max_moves=max_moves,
-                                       states=[state.copy() for state in init_states])
-        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
-        if sum(s.shape[0] for s in scores) == 0:
+            states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
             return losses
-        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-                                 examples, max_moves)
-        backprop_scores((pred_states, d_scores))
+        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
+ 
+        all_states = list(states)
+        states_golds = list(zip(states, golds))
+        n_moves = 0
+        while states_golds:
+            states, golds = zip(*states_golds)
+            scores, backprop = model.begin_update(states)
+            d_scores = self.get_batch_loss(states, golds, scores, losses)
+            # Note that the gradient isn't normalized by the batch size
+            # here, because our "samples" are really the states...But we
+            # can't normalize by the number of states either, as then we'd
+            # be getting smaller gradients for states in long sequences.
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, scores)
+            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(golds)
         if sgd not in (None, False):
             self.finish_update(sgd)
-        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
 
-    def get_loss(self, states_scores, examples, max_moves):
-        gold_states, init_states, pred_states, scores = states_scores
-        scores = self.model.ops.xp.vstack(scores)
-        costs = self._get_costs_from_histories(
-            examples,
-            gold_states,
-            init_states,
-            [list(state.history) for state in pred_states],
-            max_moves
-        )
-        xp = get_array_module(scores)
-        best_costs = costs.min(axis=1, keepdims=True)
-        gscores = scores.copy()
-        min_score = scores.min() - 1000
-        assert costs.shape == scores.shape, (costs.shape, scores.shape)
-        gscores[costs > best_costs] = min_score
-        max_ = scores.max(axis=1, keepdims=True)
-        gmax = gscores.max(axis=1, keepdims=True)
-        exp_scores = xp.exp(scores - max_)
-        exp_gscores = xp.exp(gscores - gmax)
-        Z = exp_scores.sum(axis=1, keepdims=True)
-        gZ = exp_gscores.sum(axis=1, keepdims=True)
-        d_scores = exp_scores / Z
-        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
-        return d_scores
-
-    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
-        cdef TransitionSystem moves = self.moves
-        cdef StateClass state
-        cdef int clas
-        cdef int nO = moves.n_moves
-        cdef Pool mem = Pool()
-        cdef np.ndarray costs_i
-        is_valid = <int*>mem.alloc(nO, sizeof(int))
-        batch = list(zip(init_states, histories, gold_states))
-        n_moves = 0
-        output = []
-        while batch:
-            costs = numpy.zeros((len(batch), nO), dtype="f")
-            for i, (state, history, gold) in enumerate(batch):
-                costs_i = costs[i]
-                clas = history.pop(0)
-                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
-                action = moves.c[clas]
-                action.do(state.c, action.label)
-                state.c.history.push_back(clas)
-            output.append(costs)
-            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
-            if n_moves >= max_moves >= 1:
-                break
-            n_moves += 1
-
-        return self.model.ops.xp.vstack(output)
-
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -520,9 +585,10 @@ class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.0)
+        losses.setdefault(self.name, 0.)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
+        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -530,33 +596,85 @@ class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
-        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
-
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
-
-        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
-        student_scores = self.model.ops.xp.vstack(student_scores)
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
-        # If all weights for an output are 0 in the original model, don't
-        # supervise that output. This allows us to add classes.
-        loss = (d_scores**2).sum() / d_scores.size
-        backprop_scores((student_states, d_scores))
-
+        tutor, _ = self._rehearsal_model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
+        n_scores = 0.
+        loss = 0.
+        while states:
+            targets, _ = tutor.begin_update(states)
+            guesses, backprop = model.begin_update(states)
+            d_scores = (guesses - targets) / targets.shape[0]
+            # If all weights for an output are 0 in the original model, don't
+            # supervise that output. This allows us to add classes.
+            loss += (d_scores**2).sum()
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, guesses)
+            states = [state for state in states if not state.is_final()]
+            n_scores += d_scores.size
+        # Do the backprop
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += loss
-
+        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width, drop=0.,
-                    sgd=None, losses=None, beam_density=0.0):
-        raise NotImplementedError
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
+            return losses
+        # Prepare the stepwise model, and get the callback for finishing the batch
+        model, backprop_tok2vec = self.model.begin_update(
+            [eg.predicted for eg in examples])
+        loss = _beam_utils.update_beam(
+            self.moves,
+            states,
+            golds,
+            model,
+            beam_width,
+            beam_density=beam_density,
+        )
+        losses[self.name] += loss
+        backprop_tok2vec(golds)
+        if sgd is not None:
+            self.finish_update(sgd)
+
+    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
+        cdef StateClass state
+        cdef Pool mem = Pool()
+        cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+
+        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
+        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
+        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
+                                        dtype='f', order='C')
+        c_d_scores = <float*>d_scores.data
+        unseen_classes = self.model.attrs["unseen_classes"]
+        for i, (state, gold) in enumerate(zip(states, golds)):
+            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
+            memset(costs, 0, self.moves.n_moves * sizeof(float))
+            self.moves.set_costs(is_valid, costs, state.c, gold)
+            for j in range(self.moves.n_moves):
+                if costs[j] <= 0.0 and j in unseen_classes:
+                    unseen_classes.remove(j)
+            cpu_log_loss(c_d_scores,
+                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
+        if losses is not None:
+            losses.setdefault(self.name, 0.)
+            losses[self.name] += (d_scores**2).sum()
+        return d_scores
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -595,7 +713,7 @@ class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize((doc_sample, self.moves))
+        self.model.initialize(doc_sample)
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -688,27 +806,26 @@ class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long doc will get multiple states. Let's say we
-        have a doc of length 2*N, where N is the shortest doc. We'll make
-        two states, one representing long_doc[:N], and another representing
-        long_doc[N:]."""
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-            TransitionSystem moves = self.moves
-        all_states = moves.init_batch([eg.predicted for eg in examples])
+        all_states = self.moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if moves.has_gold(eg) and not state.is_final():
-                gold = moves.init_gold(state, eg)
+            if self.moves.has_gold(eg) and not state.is_final():
+                gold = self.moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = moves.get_oracle_sequence_from_state(
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -718,52 +835,13 @@ class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = moves.c[clas]
+                    action = self.moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
-
-
-@contextlib.contextmanager
-def _change_attrs(model, **kwargs):
-    """Temporarily modify a thinc model's attributes."""
-    unset = object()
-    old_attrs = {}
-    for key, value in kwargs.items():
-        old_attrs[key] = model.attrs.get(key, unset)
-        model.attrs[key] = value
-    yield model
-    for key, value in old_attrs.items():
-        if value is unset:
-            model.attrs.pop(key)
-        else:
-            model.attrs[key] = value
-
-
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
-    cdef int step
-    cdef StateClass state
-    cdef StateC* c_state
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for state in states:
-            c_state = state.c
-            if step < c_state.history.size():
-                step_actions.append(c_state.history[step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index bb9b7653ce3..5d5fbb02790 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,7 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-def test_overfitting_IO():
-    fix_random_seed(1)
+@pytest.mark.parametrize("use_upper", [True, False])
+def test_overfitting_IO(use_upper):
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {}})
+    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,6 +577,7 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
+        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index d25eb165acb..42cf5ced998 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-import itertools
-
-import numpy
 import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
@@ -62,8 +59,6 @@
     ),
 ]
 
-PARSERS = ["parser"]  # TODO: Test beam_parser when ready
-
 eps = 0.1
 
 
@@ -176,57 +171,6 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
-def test_parser_apply_actions(en_vocab, en_parser):
-    words = ["I", "ate", "pizza"]
-    words2 = ["Eat", "more", "pizza", "!"]
-    doc1 = Doc(en_vocab, words=words)
-    doc2 = Doc(en_vocab, words=words2)
-    docs = [doc1, doc2]
-
-    moves = en_parser.moves
-    moves.add_action(0, "")
-    moves.add_action(1, "")
-    moves.add_action(2, "nsubj")
-    moves.add_action(3, "obj")
-    moves.add_action(2, "amod")
-
-    actions = [
-        numpy.array([0, 0], dtype="i"),
-        numpy.array([2, 0], dtype="i"),
-        numpy.array([0, 4], dtype="i"),
-        numpy.array([3, 3], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([0], dtype="i"),
-        numpy.array([1], dtype="i"),
-    ]
-
-    states = moves.init_batch(docs)
-    active_states = states
-
-    for step_actions in actions:
-        active_states = moves.apply_actions(active_states, step_actions)
-
-    assert len(active_states) == 0
-
-    for (state, doc) in zip(states, docs):
-        moves.set_annotations(state, doc)
-
-    assert docs[0][0].head.i == 1
-    assert docs[0][0].dep_ == "nsubj"
-    assert docs[0][1].head.i == 1
-    assert docs[0][1].dep_ == "ROOT"
-    assert docs[0][2].head.i == 1
-    assert docs[0][2].dep_ == "obj"
-
-    assert docs[1][0].head.i == 0
-    assert docs[1][0].dep_ == "ROOT"
-    assert docs[1][1].head.i == 2
-    assert docs[1][1].dep_ == "amod"
-    assert docs[1][2].head.i == 0
-    assert docs[1][2].dep_ == "obj"
-
-
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -375,7 +319,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", PARSERS)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -401,15 +345,11 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize(
-    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
-)
-def test_overfitting_IO(pipe_name, max_moves):
-    fix_random_seed(0)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
-    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -511,12 +451,10 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TODO: re-enable after we have a spacy-legacy release for v4. See
-        # https://github.com/explosion/spacy-legacy/pull/36
-        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V1
+        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V2
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
-        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f6cefbc1f84..c3c4bb6c686 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v3"
+    @architectures = "spacy.TransitionBasedParser.v2"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..8a1c74ca9ed 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,11 +189,33 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
+use_upper = true
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+parser_config_string_no_upper = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 66
+maxout_pieces = 2
+use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -224,6 +246,7 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
+        use_upper=True,
     )
     return parser
 
@@ -337,16 +360,15 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("W")
-        assert output.has_param("b")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        assert model.get_ref("upper").get_dim("nI") == 65
+        assert model.get_ref("lower").get_dim("nI") == 65
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -359,13 +381,11 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("b")
-        assert output.has_param("W")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        if model.attrs["has_upper"]:
+            assert model.get_ref("upper").get_dim("nI") == 66
+        assert model.get_ref("lower").get_dim("nI") == 66
 
 
 def test_config_nlp_roundtrip():
@@ -561,7 +581,9 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c05ef625e11..44717f6eba2 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,19 +1,22 @@
 import ctypes
 import os
 from pathlib import Path
-
 import pytest
-from pydantic import ValidationError
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+
+try:
+    from pydantic.v1 import ValidationError
+except ImportError:
+    from pydantic import ValidationError  # type: ignore
+
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int, find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -96,6 +99,34 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
+    assert model.get_param("W").shape == (nF, nO, nP, nI)
+    tensor = model.ops.alloc((10, nI))
+    Y, get_dX = model.begin_update(tensor)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
+    dY = model.ops.alloc((15, nO, nP))
+    ids = model.ops.alloc((15, nF))
+    ids[1, 2] = -1
+    dY[1] = 1
+    assert not model.has_grad("pad")
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 2, 0, 0] == 1.0
+    ids.fill(0.0)
+    dY.fill(0.0)
+    dY[0] = 0
+    ids[1, 2] = 0
+    ids[1, 1] = -1
+    ids[1, 0] = -1
+    dY[1] = 1
+    ids[2, 0] = -1
+    dY[2] = 5
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 0, 0, 0] == 6
+    assert d_pad[0, 1, 0, 0] == 1
+    assert d_pad[0, 2, 0, 0] == 0
+
+
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index c6da5157748..d1b69e1816a 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,6 @@
 # cython: profile=False
 from collections.abc import Iterable as IterableInstance
-
+import warnings
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index db8f974ea19..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,17 +833,18 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v3"
+> @architectures = "spacy.TransitionBasedParser.v2"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
+> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -873,22 +874,23 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                       |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
- </Accordion>
+</Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 765bcb8c675..1fae1dc6cda 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -741,7 +741,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -764,7 +764,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 44c80622437..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 1b0bc9606e9..2bd2856b6a3 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From bd2f254dde0ee0d8e0ba56068bf02a295666cb70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:24:09 +0100
Subject: [PATCH 308/504] isort

---
 spacy/ml/models/parser.py                   |  9 +++--
 spacy/ml/parser_model.pxd                   |  5 ++-
 spacy/ml/parser_model.pyx                   |  7 ++--
 spacy/ml/tb_framework.py                    |  3 +-
 spacy/pipeline/_parser_internals/_state.pxd |  3 +-
 spacy/pipeline/dep_parser.pyx               |  3 +-
 spacy/pipeline/ner.pyx                      |  9 +++--
 spacy/pipeline/transition_parser.pxd        |  6 +--
 spacy/pipeline/transition_parser.pyx        | 45 +++++++++++++--------
 spacy/tests/parser/test_ner.py              |  1 -
 spacy/tests/test_misc.py                    | 20 ++++-----
 spacy/training/example.pyx                  |  4 +-
 12 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea8f..f6c0e565dd3 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...errors import Errors
 from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-from ...tokens import Doc
 
 
 @registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea53f..ca31c169964 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
 from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
 
 
 cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 91558683b60..90e836f8a0a 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,18 +1,19 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
 cimport numpy as np
 from libc.math cimport exp
-from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 
 import numpy
 import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
 
 from .. import util
 from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
 from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e24e..e351ad4e570 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
 from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
 from ..util import registry
+from .parser_model import ParserStepModel
 
 
 @registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 04274ce8af1..c063cf97cd4 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,4 +1,5 @@
 cimport libcpp
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -7,8 +8,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3e59deaae4f..1fdc55dab41 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
 
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c73ec5c528a..29f22b0174d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -10,13 +10,14 @@ from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+
 from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..util import registry
+from ..scorer import PRFScore, get_ner_prf
 from ..training import remove_bilu_prefix
-
+from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index f20e69a6e56..a48d76b6819 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -1,11 +1,11 @@
 from cymem.cymem cimport Pool
 from thinc.backends.cblas cimport CBlas
 
+from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
 from ..vocab cimport Vocab
-from .trainable_pipe cimport TrainablePipe
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
 from ._parser_internals._state cimport StateC
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 
 cdef class Parser(TrainablePipe):
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 4290420c788..2fb3af44ddf 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,21 +1,20 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
+from cymem.cymem cimport Pool
+
 from itertools import islice
-from libcpp.vector cimport vector
-from libc.string cimport memset, memcpy
+
 from libc.stdlib cimport calloc, free
+from libc.string cimport memcpy, memset
+from libcpp.vector cimport vector
+
 import random
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -23,21 +22,36 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
+    softmax_activation,
+    use_ops,
 )
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
-from ._parser_internals.stateclass cimport StateClass
-from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..ml.parser_model cimport (
+    ActivationsC,
+    SizesC,
+    WeightsC,
+    alloc_activations,
+    arg_max_if_valid,
+    cpu_log_loss,
+    free_activations,
+    get_c_sizes,
+    get_c_weights,
+    predict_states,
+)
 from ..tokens.doc cimport Doc
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateClass
+
 from .trainable_pipe import TrainablePipe
+
 from ._parser_internals cimport _beam_utils
+
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
@@ -62,7 +76,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
-
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 5d5fbb02790..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,7 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 44717f6eba2..d2a41ff0fed 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 from pathlib import Path
+
 import pytest
 
 try:
@@ -8,15 +9,16 @@
 except ImportError:
     from pydantic import ValidationError  # type: ignore
 
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index d1b69e1816a..b670e0f41f4 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,6 @@
-# cython: profile=False
-from collections.abc import Iterable as IterableInstance
 import warnings
+from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64

From 19188c18e222bb8b5bc662baea0f70b27518ea58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:38:01 +0100
Subject: [PATCH 309/504] Add distillation tests with max cut size

And fix endless loop when the max cut size is 0 or 1.
---
 spacy/pipeline/transition_parser.pyx | 2 +-
 spacy/tests/parser/test_ner.py       | 5 ++++-
 spacy/tests/parser/test_parse.py     | 5 ++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 2fb3af44ddf..17a4fdb1b93 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -302,7 +302,7 @@ cdef class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
             states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..7c3a9d56249 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 42cf5ced998..dbede7edd52 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -402,7 +402,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -420,6 +422,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From b836e4db139ac58c73e7f91e192c877e9df16ff6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:02:15 +0100
Subject: [PATCH 310/504] Fix Cython lints

---
 spacy/ml/parser_model.pxd                |  9 ++--
 spacy/ml/parser_model.pyx                | 64 ++++++++++++------------
 spacy/pipeline/_parser_internals/ner.pyx |  4 +-
 spacy/pipeline/dep_parser.pyx            |  1 +
 spacy/pipeline/ner.pyx                   |  3 +-
 spacy/pipeline/transition_parser.pxd     |  4 +-
 spacy/pipeline/transition_parser.pyx     | 42 ++++++----------
 spacy/training/example.pyx               |  1 -
 8 files changed, 58 insertions(+), 70 deletions(-)

diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index ca31c169964..88386255147 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -41,10 +41,9 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
 cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
- 
+                         const WeightsC* W, SizesC n) nogil
+
 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
- 
+cdef void cpu_log_loss(float* d_scores, const float* costs,
+                       const int* is_valid, const float* scores, int O) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 90e836f8a0a..843275f4c8b 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -13,7 +13,7 @@ from .. import util
 from ..errors import Errors
 
 from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
@@ -78,31 +78,31 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
+                                   n.states * n.classes * sizeof(A.scores[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
+                         const WeightsC* W, SizesC n) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
     memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
+                       n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
+                     &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = _arg_max(&A.unmaxed[index], n.pieces)
@@ -112,10 +112,10 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
         memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
     else:
         # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes)
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
+                     <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens, 0.0,
+                     A.scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
@@ -131,9 +131,9 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
                 A.scores[i*n.classes+j] = min_
 
 
-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
+cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
+                             const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     padding = cached
     cached += F * O
@@ -150,9 +150,8 @@ cdef void sum_state_features(CBlas cblas, float* output,
         token_ids += F
 
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
+cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
+                       const float* scores, int O) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -178,7 +177,7 @@ cdef void cpu_log_loss(float* d_scores,
 
 
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
+                         const int* is_valid, int n) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -202,10 +201,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
     return best
 
 
-
 class ParserStepModel(Model):
     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
+                 dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
         self.attrs["dropout_rate"] = dropout
@@ -267,7 +265,7 @@ class ParserStepModel(Model):
 
     def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
         if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+           and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
             # Move token_ids and d_vector to GPU, asynchronously
             self.backprops.append((
                 util.get_async(self.cuda_stream, token_ids),
@@ -277,7 +275,6 @@ class ParserStepModel(Model):
         else:
             self.backprops.append((token_ids, d_vector, get_d_tokvecs))
 
-
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -290,14 +287,15 @@ class ParserStepModel(Model):
             ids = ids.flatten()
             d_state_features = d_state_features.reshape(
                 (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
+            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
         # Padded -- see update()
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs
 
+
 NUMPY_OPS = NumpyOps()
 
+
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
@@ -310,7 +308,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
+        def get_d_vector(d_scores): return d_scores
     # If the class is unseen, make sure its score is minimum
     scores[:, model._class_mask == 0] = numpy.nanmin(scores)
 
@@ -445,8 +443,8 @@ cdef class precompute_hiddens:
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(cblas, <float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0], token_ids.shape[0],
+                           self.nF, self.nO*self.nP)
         state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
@@ -471,7 +469,7 @@ cdef class precompute_hiddens:
 
         def backprop_maxout(d_best):
             return self.ops.backprop_maxout(d_best, mask, self.nP)
-        
+
         return state_vector, backprop_maxout
 
     def _relu_nonlinearity(self, state_vector):
@@ -485,7 +483,7 @@ cdef class precompute_hiddens:
         def backprop_relu(d_best):
             d_best *= mask
             return d_best.reshape((d_best.shape + (1,)))
- 
+
         return state_vector, backprop_relu
 
 cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 0b9980ddbf2..be769bd9cd0 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -159,7 +159,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-    
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -644,7 +644,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
- 
+
 
 
 cdef class Out:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fdc55dab41..cbd7187ff0f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 29f22b0174d..fe54d33a17b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index a48d76b6819..7adb82213de 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser(TrainablePipe):
     cdef object _cpu_ops
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
+                      WeightsC weights, SizesC sizes) nogil
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
+                                 int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 17a4fdb1b93..fa9a76772ec 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -10,7 +10,7 @@ from cymem.cymem cimport Pool
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
@@ -23,14 +23,13 @@ from thinc.api import (
     NumpyOps,
     Optimizer,
     chain,
-    get_array_module,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
     ActivationsC,
@@ -45,7 +44,6 @@ from ..ml.parser_model cimport (
     predict_states,
 )
 from ..tokens.doc cimport Doc
-from ._parser_internals.search cimport Beam
 from ._parser_internals.stateclass cimport StateClass
 
 from .trainable_pipe import TrainablePipe
@@ -55,11 +53,10 @@ from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
-from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals cimport _beam_utils
 from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals.transition_system cimport Transition
 from .trainable_pipe cimport TrainablePipe
 
 from .. import util
@@ -295,7 +292,7 @@ cdef class Parser(TrainablePipe):
         with use_ops("numpy"):
             teacher_model = chain(teacher_step_model, softmax_activation())
             student_model = chain(student_step_model, softmax_activation())
-        
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -440,8 +437,6 @@ cdef class Parser(TrainablePipe):
         return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
         batch = _beam_utils.BeamBatch(
             self.moves,
@@ -462,15 +457,15 @@ cdef class Parser(TrainablePipe):
         return list(batch)
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+                      WeightsC weights, SizesC sizes) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(states, activations.scores,
+                                    sizes.classes, sizes.states)
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -499,7 +494,7 @@ cdef class Parser(TrainablePipe):
         return [state for state in states if not state.c.is_final()]
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
+                                 int nr_class, int batch_size) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -557,8 +552,7 @@ cdef class Parser(TrainablePipe):
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
- 
-        all_states = list(states)
+
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -638,8 +632,8 @@ cdef class Parser(TrainablePipe):
         del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
+                    losses=None, beam_density=0.0):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -670,7 +664,7 @@ cdef class Parser(TrainablePipe):
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                        dtype='f', order='C')
+                                               dtype='f', order='C')
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -680,8 +674,8 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
+                         d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
@@ -791,10 +785,7 @@ cdef class Parser(TrainablePipe):
         long_doc[:N], and another representing long_doc[N:]. In contrast to
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
-        cdef:
-            StateClass start_state
-            StateClass state
-            Transition action
+        cdef StateClass state
         all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
@@ -816,7 +807,6 @@ cdef class Parser(TrainablePipe):
                 length += 1
         return states
 
-
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index b670e0f41f4..a5e93125610 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,3 @@
-import warnings
 from collections.abc import Iterable as IterableInstance
 
 import numpy

From c4765ca4362c13056bcc8568ff20f7cf0ba34296 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:17:24 +0100
Subject: [PATCH 311/504] Bring back W401

---
 spacy/errors.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index a5d0b3d11a9..5d6d65e3b26 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

From 8a398739514bafeebb596e1defc6c4fb9cac2dc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 19 Dec 2023 09:28:20 +0100
Subject: [PATCH 312/504] Fix `TransitionBasedParser` version in transformer
 embeddings docs

---
 website/docs/usage/embeddings-transformers.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..534cf478087 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 07f03d29e3c77a1413a6a497dbef128d4503e934 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 09:47:38 +0100
Subject: [PATCH 313/504] No need for `Literal` compat, since we only support
 >= 3.8

---
 spacy/compat.py           | 5 -----
 spacy/errors.py           | 1 -
 spacy/ml/models/parser.py | 3 +--
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 30459e2e495..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,11 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 5d6d65e3b26..1af8a3b0891 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1006,7 +1006,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
-
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..e776174f6ed 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,9 +1,8 @@
-from typing import List, Optional, cast
+from typing import List, Literal, Optional
 
 from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry

From 54bfd06b4e7f0a6e0b9c798fe472c18c27de361c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 10:06:28 +0100
Subject: [PATCH 314/504] Fix parser distillation test seed

The test would sometimes fail. Rather than increasing test by increasing
training iterations, use a known-good seed.
---
 spacy/tests/parser/test_parse.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index dbede7edd52..f63d56f6922 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -405,6 +405,7 @@ def test_is_distillable():
 @pytest.mark.slow
 @pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
 def test_distill(max_moves):
+    fix_random_seed(0)
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []

From 22e7b671b5b1af1692cd16420f7e253a6bf6333d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:14:35 +0100
Subject: [PATCH 315/504] TransitionBasedParser.v2 in run example output

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/docs/api/cli.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 1fae1dc6cda..cfa99a2b350 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]

From 233143bb0704719431d914b1b6e73dd3e9e18fdf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 16 Jan 2024 14:54:26 +0100
Subject: [PATCH 316/504] Update thinc dependency to 9.0.0.dev4

---
 pyproject.toml                         |  2 +-
 requirements.txt                       |  2 +-
 setup.cfg                              | 10 +++++++++-
 spacy/pipeline/edit_tree_lemmatizer.py |  7 ++-----
 spacy/pipeline/morphologizer.pyx       |  7 +++----
 spacy/pipeline/senter.pyx              |  7 ++-----
 spacy/pipeline/tagger.pyx              | 13 ++++++++-----
 spacy/pipeline/transition_parser.pyx   |  4 ++--
 8 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3891d137867..0a5bc162773 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=9.0.0.dev4,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 29420430aab..e82f28055bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev4,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 3a84f37d3bf..223c63dd6da 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev4,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -37,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev2,<9.1.0
+    thinc>=9.0.0.dev4,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 046ef19c3d5..1a29735e8e8 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -5,7 +5,6 @@
 import numpy as np
 import srsly
 from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
@@ -128,9 +127,7 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(
-            normalize=False, missing_value=-1
-        )
+        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
 
         truths = []
         for eg in examples:
@@ -166,7 +163,7 @@ def get_teacher_student_loss(
 
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 669a5424412..0f77326e67d 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -2,9 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, Optional, Union
 
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
@@ -296,7 +294,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 42615e194e0..51670dcf8cf 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -2,10 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
-import srsly
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..tokens.doc cimport Doc
 
@@ -160,7 +157,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f3d0527ea0b..21c7b3ab0a3 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -4,9 +4,7 @@ from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy
-import srsly
-from thinc.api import Config, Model, set_dropout_rate
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
 from thinc.types import Floats2d, Ints1d
 
 from ..morphology cimport Morphology
@@ -275,7 +273,7 @@ class Tagger(TrainablePipe):
         
         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
@@ -292,7 +290,12 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
+        loss_func = SequenceCategoricalCrossentropy(
+            names=self.labels,
+            normalize=False,
+            neg_prefix=self.cfg["neg_prefix"],
+            label_smoothing=self.cfg["label_smoothing"]
+        )
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index fa9a76772ec..c728f1b7909 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -22,13 +22,13 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    SequenceCategoricalCrossentropy,
     chain,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
@@ -355,7 +355,7 @@ cdef class Parser(TrainablePipe):
         
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))

From cbd36008c57863e9d531aba4d1a329db18388310 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 17 Jan 2024 09:53:01 +0100
Subject: [PATCH 317/504] Temporily xfail local remote storage test

---
 spacy/tests/test_cli.py | 61 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a47f03e8ab4..c9e823ffe68 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -878,6 +878,67 @@ def test_applycli_user_data():
         assert result[0]._.ext == val
 
 
+# TODO: remove this xfail after merging master into v4. The issue
+#       is that for local files, pathy started returning os.stat_result,
+#       which doesn't have a last_modified property. So, recency-sorting
+#       fails and the test fails. However, once we merge master into
+#       v4, we'll use weasel, which in turn uses cloudpathlib, which
+#       should resolve this issue.
+@pytest.mark.xfail(reason="Recency sorting is broken on some platforms")
+def test_local_remote_storage():
+    with make_tempdir() as d:
+        filename = "a.txt"
+
+        content_hashes = ("aaaa", "cccc", "bbbb")
+        for i, content_hash in enumerate(content_hashes):
+            # make sure that each subsequent file has a later timestamp
+            if i > 0:
+                time.sleep(1)
+            content = f"{content_hash} content"
+            loc_file = d / "root" / filename
+            if not loc_file.parent.exists():
+                loc_file.parent.mkdir(parents=True)
+            with loc_file.open(mode="w") as file_:
+                file_.write(content)
+
+            # push first version to remote storage
+            remote = RemoteStorage(d / "root", str(d / "remote"))
+            remote.push(filename, "aaaa", content_hash)
+
+            # retrieve with full hashes
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with command hash
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa")
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with content hash
+            loc_file.unlink()
+            remote.pull(filename, content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with no hashes
+            loc_file.unlink()
+            remote.pull(filename)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+
+def test_local_remote_storage_pull_missing():
+    # pulling from a non-existent remote pulls nothing gracefully
+    with make_tempdir() as d:
+        filename = "a.txt"
+        remote = RemoteStorage(d / "root", str(d / "remote"))
+        assert remote.pull(filename, command_hash="aaaa") is None
+        assert remote.pull(filename) is None
+
+
 def test_cli_find_threshold(capsys):
     def make_examples(nlp: Language) -> List[Example]:
         docs: List[Example] = []

From fb9410060e4761ea6e53b69d4483a6d13e8cb123 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 30 Oct 2023 17:02:08 +0100
Subject: [PATCH 318/504] Add note in docs on `score_weight` config if using a
 non-default `spans_key` for SpanCat (#13093)

* Add note on score_weight if using a non-default span_key for SpanCat.

* Fix formatting.

* Fix formatting.

* Fix typo.

* Use warning infobox.

* Fix infobox formatting.
---
 website/docs/api/spancategorizer.mdx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 258db794786..33219751ca6 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -75,8 +75,7 @@ architectures and their arguments and hyperparameters.
 <Infobox variant="warning">
 
 If you set a non-default value for `spans_key`, you'll have to update
-`[training.score_weights]` as well so that weights are computed properly. E. g.
-for `spans_key == "myspankey"`, include this in your config:
+`[training.score_weights]` as well so that weights are computed properly. E. g. for `span_key == "myspankey"`, include this in your config:
 
 ```ini
 [training.score_weights]

From 2f6d3d5d792eef339b490bde61b5c2f82521b3e6 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 10 Nov 2023 08:05:07 +0100
Subject: [PATCH 319/504] Warn about reloading dependencies after downloading
 models (#13081)

* Update the "Missing factory" error message

This accounts for model installations that took place during the current Python session.

* Add a note about Jupyter notebooks

* Move error to `spacy.cli.download`
Add extra message for Jupyter sessions

* Add additional note for interactive sessions

* Remove note about `spacy-transformers` from error message

* `isort`

* Improve checks for colab (also helps displacy)

* Update warning messages

* Improve flow for multiple checks

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/download.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0635522930b..5e460717cc4 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,10 +7,11 @@
 from wasabi import msg
 
 from .. import about
+from ..errors import OLD_MODEL_SHORTCUTS
 from ..util import (
-    get_installed_models,
     get_minor_version,
-    get_package_version,
+    is_in_interactive,
+    is_in_jupyter,
     is_package,
     is_prerelease_version,
     run_command,

From ee2cf7b25b66acb4260a99abd1366ed452687a18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 29 Nov 2023 09:11:54 +0100
Subject: [PATCH 320/504] Update `TextCatBOW` to use the fixed `SparseLinear`
 layer (#13149)

* Update `TextCatBOW` to use the fixed `SparseLinear` layer

A while ago, we fixed the `SparseLinear` layer to use all available
parameters: https://github.com/explosion/thinc/pull/754

This change updates `TextCatBOW` to `v3` which uses the new
`SparseLinear_v2` layer. This results in a sizeable improvement on a
text categorization task that was tested.

While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent`
option to make it possible to change the hidden size. Ideally, we'd just
have an option called `length`. But the way that `TextCatBOW` uses
hashes results in a non-uniform distribution of parameters when the
length is not a power of two.

* Replace TexCatBOW `length_exponent` parameter by `length`

We now round up the length to the next power of two if it isn't
a power of two.

* Remove some tests for TextCatBOW.v2

* Fix missing import
---
 spacy/errors.py                      |  3 ---
 spacy/tests/pipeline/test_textcat.py |  8 +++---
 website/docs/api/architectures.mdx   | 40 ++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 1af8a3b0891..571335009be 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -974,9 +974,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
-    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
-             "reduction. Please enable one of `use_reduce_first`, "
-             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3f2d757eebc..3653739befd 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -499,9 +499,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # REDUCE
-        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        # CNN
+        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
     ],
 )
 # fmt: on
@@ -749,7 +749,7 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2 (legacy)
+        # CNN V2
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # PARAMETRIC ATTENTION V1
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..31beb15644c 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,6 +1020,46 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
+<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+</Accordion>
+
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}
+
 > #### Example Config
 >
 > ```ini

From 2764686f256ac6a980aed1673ddc4341abb5951b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:00:06 +0100
Subject: [PATCH 321/504] Add TextCatReduce.v1 (#13181)

* Add TextCatReduce.v1

This is a textcat classifier that pools the vectors generated by a
tok2vec implementation and then applies a classifier to the pooled
representation. Three reductions are supported for pooling: first, max,
and mean. When multiple reductions are enabled, the reductions are
concatenated before providing them to the classification layer.

This model is a generalization of the TextCatCNN model, which only
supports mean reductions and is a bit of a misnomer, because it can also
be used with transformers. This change also reimplements TextCatCNN.v2
using the new TextCatReduce.v1 layer.

* Doc fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fully specify `TextCatCNN` <-> `TextCatReduce` equivalence

* Move TextCatCNN docs to legacy, in prep for moving to spacy-legacy

* Add back a test for TextCatCNN.v2

* Replace TextCatCNN in pipe configurations and templates

* Add an infobox to the `TextCatReduce` section with an `TextCatCNN` anchor

* Add last reduction (`use_reduce_last`)

* Remove non-working TextCatCNN Netlify redirect

* Revert layer changes for the quickstart

* Revert one more quickstart change

* Remove unused import

* Fix docstring

* Fix setting name in error message

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                      |  3 +
 spacy/ml/models/textcat.py           | 85 ++++------------------------
 spacy/tests/pipeline/test_textcat.py | 13 ++---
 website/docs/api/architectures.mdx   | 78 -------------------------
 4 files changed, 21 insertions(+), 158 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 571335009be..1af8a3b0891 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -974,6 +974,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 601c94a7f0a..1a49bac1d9d 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -22,6 +22,9 @@
     reduce_first,
     reduce_last,
     reduce_max,
+    reduce_first,
+    reduce_last,
+    reduce_max,
     reduce_mean,
     reduce_sum,
     residual,
@@ -63,6 +66,15 @@ def build_simple_cnn_text_classifier(
         use_reduce_mean=True,
         nO=nO,
     )
+    return build_reduce_text_classifier(
+        tok2vec=tok2vec,
+        exclusive_classes=exclusive_classes,
+        use_reduce_first=False,
+        use_reduce_last=False,
+        use_reduce_max=False,
+        use_reduce_mean=True,
+        nO=nO,
+    )
 
 
 def resize_and_set_ref(model, new_nO, resizable_layer):
@@ -221,79 +233,6 @@ def build_text_classifier_lowdata(
     return model
 
 
-@registry.architectures("spacy.TextCatParametricAttention.v1")
-def build_textcat_parametric_attention_v1(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    exclusive_classes: bool,
-    nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    width = tok2vec.maybe_get_dim("nO")
-    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
-        tok2vec=tok2vec,
-        nonlinear_layer=Maxout(nI=width, nO=width),
-        key_transform=Gelu(nI=width, nO=width),
-    )
-    with Model.define_operators({">>": chain}):
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO)
-        else:
-            output_layer = Linear(nO=nO) >> Logistic()
-        model = parametric_attention >> output_layer
-    if model.has_dim("nO") is not False and nO is not None:
-        model.set_dim("nO", cast(int, nO))
-    model.set_ref("output_layer", output_layer)
-    model.attrs["multi_label"] = not exclusive_classes
-
-    return model
-
-
-def _build_parametric_attention_with_residual_nonlinear(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    nonlinear_layer: Model[Floats2d, Floats2d],
-    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
-) -> Model[List[Doc], Floats2d]:
-    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
-        norm_layer = LayerNorm(nI=width)
-        parametric_attention = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
-        )
-
-        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
-
-        parametric_attention.set_ref("tok2vec", tok2vec)
-        parametric_attention.set_ref("attention_layer", attention_layer)
-        parametric_attention.set_ref("key_transform", key_transform)
-        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
-        parametric_attention.set_ref("norm_layer", norm_layer)
-
-        return parametric_attention
-
-
-def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nI", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nO", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
-    return model
-
-
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
     tok2vec: Model,
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 3653739befd..9ee93af0fef 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -473,6 +473,8 @@ def test_no_resize(name, textcat_config):
         # CNN
         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -499,9 +501,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -749,12 +751,9 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2
+        # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
-        # PARAMETRIC ATTENTION V1
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 31beb15644c..63f723a28cf 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,46 +1020,6 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v3 {id="TextCatBOW"}
-
 > #### Example Config
 >
 > ```ini
@@ -1096,44 +1056,6 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
-### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatParametricAttention.v1"
-> exclusive_classes = true
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.Tok2Vec.v2"
->
-> [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v2"
-> width = 64
-> rows = [2000, 2000, 1000, 1000, 1000, 1000]
-> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
-> include_static_vectors = false
->
-> [model.tok2vec.encode]
-> @architectures = "spacy.MaxoutWindowEncoder.v2"
-> width = ${model.tok2vec.embed.width}
-> window_size = 1
-> maxout_pieces = 3
-> depth = 2
-> ```
-
-A neural network model that is built upon Tok2Vec and uses parametric attention
-to attend to tokens that are relevant to text classification.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From d8b8a12fce5ec2df4caf2dd48480201c7c2d392f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 2 Jan 2024 10:03:06 +0100
Subject: [PATCH 322/504] Add spacy.TextCatParametricAttention.v1 (#13201)

* Add spacy.TextCatParametricAttention.v1

This layer provides is a simplification of the ensemble classifier that
only uses paramteric attention. We have found empirically that with a
sufficient amount of training data, using the ensemble classifier with
BoW does not provide significant improvement in classifier accuracy.
However, plugging in a BoW classifier does reduce GPU training and
inference performance substantially, since it uses a GPU-only kernel.

* Fix merge fallout
---
 pyproject.toml                       |  5 ++-
 requirements.txt                     |  2 +-
 setup.cfg                            |  4 +-
 spacy/ml/models/textcat.py           | 65 ++++++++++++++++++++++++++++
 spacy/tests/pipeline/test_textcat.py |  3 ++
 website/docs/api/architectures.mdx   | 38 ++++++++++++++++
 6 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0a5bc162773..bfd7e68d1f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev4,<9.1.0",
-    "numpy>=1.15.0",
+    "thinc>=8.2.2,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index e82f28055bb..e99ebc90ab2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev4,<9.1.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 223c63dd6da..b57fdc52bf9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -45,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 1a49bac1d9d..4b3d2de9171 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -233,6 +233,71 @@ def build_text_classifier_lowdata(
     return model
 
 
+@registry.architectures("spacy.TextCatParametricAttention.v1")
+def build_textcat_parametric_attention_v1(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    exclusive_classes: bool,
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    width = tok2vec.maybe_get_dim("nO")
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=Gelu(nI=width, nO=width),
+    )
+    with Model.define_operators({">>": chain}):
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO)
+        else:
+            output_layer = Linear(nO=nO) >> Logistic()
+        model = parametric_attention >> output_layer
+    if model.has_dim("nO") is not False and nO is not None:
+        model.set_dim("nO", cast(int, nO))
+    model.set_ref("output_layer", output_layer)
+    model.attrs["multi_label"] = not exclusive_classes
+
+    return model
+
+
+def _build_parametric_attention_with_residual_nonlinear(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    nonlinear_layer: Model[Floats2d, Floats2d],
+    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
+) -> Model[List[Doc], Floats2d]:
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.maybe_get_dim("nO")
+        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
+        norm_layer = LayerNorm(nI=width)
+        parametric_attention = (
+            tok2vec
+            >> list2ragged()
+            >> attention_layer
+            >> reduce_sum()
+            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
+        )
+
+        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
+
+        parametric_attention.set_ref("tok2vec", tok2vec)
+        parametric_attention.set_ref("attention_layer", attention_layer)
+        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
+        parametric_attention.set_ref("norm_layer", norm_layer)
+
+        return parametric_attention
+
+
+def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    tok2vec_width = get_tok2vec_width(model)
+    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
+    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
+    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
+    init_chain(model, X, Y)
+    return model
+
+
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
     tok2vec: Model,
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 9ee93af0fef..2bba40d1d13 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -754,6 +754,9 @@ def test_overfitting_IO_multi():
         # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 63f723a28cf..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1056,6 +1056,44 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
+### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatParametricAttention.v1"
+> exclusive_classes = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v2"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v2"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v2"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
+> ```
+
+A neural network model that is built upon Tok2Vec and uses parametric attention
+to attend to tokens that are relevant to text classification.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From ab6fbe0dc4a8a27a5f8fbf530bb3d38cf3adea43 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 4 Dec 2023 15:23:28 +0100
Subject: [PATCH 323/504] Add documentation for EL task (#12988)

* Add documentation for EL task.

* Fix EL factory name.

* Add llm_entity_linker_mentio.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Update EL task docs.

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Incorporate feedback.

* Format.

* Fix link to KB data.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 website/docs/api/large-language-models.mdx | 172 ++++++++++++++++++++-
 1 file changed, 169 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx
index cefd5c66ee1..583aa25a44d 100644
--- a/website/docs/api/large-language-models.mdx
+++ b/website/docs/api/large-language-models.mdx
@@ -20,9 +20,10 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible
 through a generic `llm`
 [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
 as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
-`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`,
-`llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the
-GPT-3-5 model from OpenAI is used by default, but this can be customized.
+`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and
+`llm_entity_linker`.
+
+### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
 
 > #### Example
 >
@@ -687,6 +688,171 @@ for a toy example of how such a KB file might look like.
 | -------- | ------------------------------------- |
 | `path`   | Path to KB file. ~~Union[str, Path]~~ |
 
+### EL (Entity Linking) {id="nel"}
+
+The EL links recognized entities (see [NER](#ner)) to those in a knowledge base
+(KB). The EL task prompts the LLM to select the most likely candidate from the
+KB, whose structure can be arbitrary.
+
+Note that the documents processed by the entity linking task are expected to
+have recognized entities in their `.ents` attribute. This can be achieved by
+either running the [NER task](#ner), using a trained spaCy NER model or setting
+the entities manually prior to running the EL task.
+
+In order to be able to pull data from the KB, an object implementing the
+`CandidateSelector` protocol has to be provided. This requires two functions:
+(1) `__call__()` to fetch candidate entities for entity mentions in the text
+(assumed to be available in `Doc.ents`) and (2) `get_entity_description()` to
+fetch descriptions for any given entity ID. Descriptions can be empty, but
+ideally provide more context for entities stored in the KB.
+
+`spacy-llm` provides a `CandidateSelector` implementation
+(`spacy.CandidateSelector.v1`) that leverages a spaCy knowledge base - as used
+in an `entity_linking` component - to select candidates. This knowledge base can
+be loaded from an existing spaCy pipeline (note that the pipeline's EL component
+doesn't have to be trained) or from a separate .yaml file.
+
+#### spacy.EntityLinker.v1 {id="el-v1"}
+
+Supports zero- and few-shot prompting. Relies on a configurable component
+suggesting viable entities before letting the LLM pick the most likely
+candidate.
+
+> #### Example config for spacy.EntityLinker.v1
+>
+> ```ini
+> [paths]
+> el_nlp = null
+>
+> ...
+>
+> [components.llm.task]
+> @llm_tasks = "spacy.EntityLinker.v1"
+>
+> [initialize]
+> [initialize.components]
+> [initialize.components.llm]
+> [initialize.components.llm.candidate_selector]
+> @llm_misc = "spacy.CandidateSelector.v1"
+>
+> # Load a KB from a KB file. For loading KBs from spaCy pipelines see spacy.KBObjectLoader.v1.
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base .yaml file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument              | Description                                                                                                                                                                                   |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `template`            | Custom prompt template to send to LLM model. Defaults to [entity_linker.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/entity_linker.v1.jinja). ~~str~~ |
+| `parse_responses`     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[EntityLinkerTask]]~~                                   |
+| `prompt_example_type` | Type to use for fewshot examples. Defaults to `ELExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                 |
+| `examples`            | Optional callable that reads a file containing task examples for few-shot learning. If `None` is passed, zero-shot learning will be used. Defaults to `None`. ~~ExamplesConfigType~~          |
+| `scorer`              | Scorer function. Defaults to the metric used by spaCy to evaluate entity linking performance. ~~Optional[Scorer]~~                                                                            |
+
+##### spacy.CandidateSelector.v1 {id="candidate-selector-v1"}
+
+`spacy.CandidateSelector.v1` is an implementation of the `CandidateSelector`
+protocol required by [`spacy.EntityLinker.v1`](#el-v1). The built-in candidate
+selector method allows loading existing knowledge bases in several ways, e. g.
+loading from a spaCy pipeline with a (not necessarily trained) entity linking
+component, and loading from a file describing the knowlege base as a .yaml file.
+Either way the loaded data will be converted to a spaCy `InMemoryLookupKB`
+instance. The KB's selection capabilities are used to select the most likely
+entity candidates for the specified mentions.
+
+> #### Example config for spacy.CandidateSelector.v1
+>
+> ```ini
+> [initialize]
+> [initialize.components]
+> [initialize.components.llm]
+> [initialize.components.llm.candidate_selector]
+> @llm_misc = "spacy.CandidateSelector.v1"
+>
+> # Load a KB from a KB file. For loading KBs from spaCy pipelines see spacy.KBObjectLoader.v1.
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base .yaml file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument    | Description                                                       |
+| ----------- | ----------------------------------------------------------------- |
+| `kb_loader` | KB loader object. ~~InMemoryLookupKBLoader~~                      |
+| `top_n`     | Top-n candidates to include in the prompt. Defaults to 5. ~~int~~ |
+
+##### spacy.KBObjectLoader.v1 {id="kb-object-loader-v1"}
+
+Adheres to the `InMemoryLookupKBLoader` interface required by
+[`spacy.CandidateSelector.v1`](#candidate-selector-v1). Loads a knowledge base
+from an existing spaCy pipeline.
+
+> #### Example config for spacy.KBObjectLoader.v1
+>
+> ```ini
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBObjectLoader.v1"
+> # Path to knowledge base directory in serialized spaCy pipeline.
+> path = ${paths.el_kb}
+> # Path to spaCy pipeline. If this is not specified, spacy-llm tries to determine this automatically (but may fail).
+> nlp_path = ${paths.el_nlp}
+> # Path to file with descriptions for entity.
+> desc_path = ${paths.el_desc}
+> ```
+
+| Argument          | Description                                                                                                                                                                                                                         |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`            | Path to KB file. ~~Union[str, Path]~~                                                                                                                                                                                               |
+| `nlp_path`        | Path to serialized NLP pipeline. If None, path will be guessed. ~~Optional[Union[Path, str]]~~                                                                                                                                      |
+| `desc_path`       | Path to file with descriptions for entities. ~~int~~                                                                                                                                                                                |
+| `ent_desc_reader` | Entity description reader. Defaults to an internal method expecting a CSV file without header row, with ";" as delimiters, and with two columns - one for the entitys' IDs, one for their descriptions. ~~Optional[EntDescReader]~~ |
+
+##### spacy.KBFileLoader.v1 {id="kb-file-loader-v1"}
+
+Adheres to the `InMemoryLookupKBLoader` interface required by
+[`spacy.CandidateSelector.v1`](#candidate-selector-v1). Loads a knowledge base
+from a knowledge base file. The KB .yaml file has to stick to the following
+format:
+
+```yaml
+entities:
+  # The key should be whatever ID identifies this entity uniquely in your knowledge base.
+  ID1:
+      name: "..."
+      desc: "..."
+  ID2:
+      ...
+# Data on aliases in your knowledge base - e. g. "Apple" for the entity "Apple Inc.".
+aliases:
+  - alias: "..."
+    # List of all entities that this alias refers to.
+    entities: ["ID1", "ID2", ...]
+    # Optional: prior probabilities that this alias refers to the n-th entity in the "entities" attribute.
+    probabilities: [0.5, 0.2, ...]
+  - alias: "..."
+    entities: [...]
+    probabilities: [...]
+  ...
+```
+
+See
+[here](https://github.com/explosion/spacy-llm/blob/main/usage_examples/el_openai/el_kb_data.yml)
+for a toy example of how such a KB file might look like.
+
+> #### Example config for spacy.KBFileLoader.v1
+>
+> ```ini
+> [initialize.components.llm.candidate_selector.kb_loader]
+> @llm_misc = "spacy.KBFileLoader.v1"
+> # Path to knowledge base file.
+> path = ${paths.el_kb}
+> ```
+
+| Argument | Description                           |
+| -------- | ------------------------------------- |
+| `path`   | Path to KB file. ~~Union[str, Path]~~ |
+
 ### NER {id="ner"}
 
 The NER task identifies non-overlapping entities in text.

From aa938c54bf218f1b3735502264bf25fcab9dfc5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 10:28:46 +0100
Subject: [PATCH 324/504] Typing fixes

---
 requirements.txt           |  2 +-
 spacy/tokens/span.pyi      |  2 ++
 spacy/training/example.pyi |  4 ++++
 spacy/training/example.pyx |  6 ++++++
 spacy/training/loop.py     | 26 ++++++++++++++------------
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e99ebc90ab2..bee5535257f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 2a529593e5f..f1030278c69 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -139,6 +139,8 @@ class Span:
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
+    @label_.setter
+    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property
diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
index 06639d70c06..33cf07b0902 100644
--- a/spacy/training/example.pyi
+++ b/spacy/training/example.pyi
@@ -9,6 +9,10 @@ def annotations_to_doc(
     tok_annot: Dict[str, Any],
     doc_annot: Dict[str, Any],
 ) -> Doc: ...
+def validate_distillation_examples(
+    examples: Iterable[Example],
+    method: str,
+) -> None: ...
 def validate_examples(
     examples: Iterable[Example],
     method: str,
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index a5e93125610..daa6ca3f468 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,12 @@ def validate_examples(examples, method):
 
 
 def validate_distillation_examples(examples, method):
+    """Check that a batch of examples received during processing is valid
+    for distillation.
+
+    examples (Iterable[Examples]): A batch of examples.
+    method (str): The method name to show in error messages.
+    """
     validate_examples(examples, method)
     for eg in examples:
         if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 63715ec2c42..575a583b78c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -12,7 +12,9 @@
     Iterable,
     List,
     Optional,
+    Sized,
     Tuple,
+    TypeVar,
     Union,
 )
 
@@ -22,7 +24,6 @@
 from .. import ty
 from ..errors import Errors
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
-from ..tokens.doc import Doc
 from ..util import (
     logger,
     registry,
@@ -282,7 +283,7 @@ def _distill_loop(
     teacher: "Language",
     student: "Language",
     optimizer: Optimizer,
-    distill_data: Iterable[List[Example]],
+    distill_data: Iterable[Tuple[int, List[Example]]],
     evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
@@ -401,7 +402,7 @@ def _distill_loop(
 def train_while_improving(
     nlp: "Language",
     optimizer: Optimizer,
-    train_data: Iterable[List[Example]],
+    train_data: Iterable[Tuple[int, List[Example]]],
     evaluate: Callable[[], Tuple[float, Dict[str, float]]],
     *,
     dropout: float,
@@ -520,15 +521,16 @@ def train_while_improving(
             break
 
 
+ItemT = TypeVar("ItemT", bound=Sized)
+
+
 def subdivide_batch(
-    batch: Union[Iterable[Doc], Iterable[Example]], accumulate_gradient: int
-):
+    batch: Iterable[ItemT], accumulate_gradient: int
+) -> Iterable[List[ItemT]]:
     batch = list(batch)
     if len(batch):
-        if isinstance(batch[0], Example):
-            batch.sort(key=lambda eg: len(eg.predicted))
-        else:
-            batch.sort(key=lambda doc: len(doc))
+        # Examples are sorted by their predicted length.
+        batch.sort(key=lambda item: len(item))
     sub_len = len(batch) // accumulate_gradient
     start = 0
     for i in range(accumulate_gradient):
@@ -578,7 +580,7 @@ def create_distill_batches(
     corpus: Callable[["Language"], Iterable[Example]],
     batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
     max_epochs: int,
-):
+) -> Iterable[Tuple[int, List[Example]]]:
     """Create distillation batches. In contrast to training, the corpus
     is normally too large to load into memory and shuffle."""
     epoch = 0
@@ -592,9 +594,9 @@ def create_distill_batches(
 def create_train_batches(
     nlp: "Language",
     corpus: Callable[["Language"], Iterable[Example]],
-    batcher: Callable[[Iterable[Example]], Iterable[Example]],
+    batcher: Callable[[Iterable[Example]], Iterable[List[Example]]],
     max_epochs: int,
-):
+) -> Iterable[Tuple[int, List[Example]]]:
     epoch = 0
     if max_epochs >= 0:
         examples = list(corpus(nlp))  # type: Iterable[Example]

From 0cac90e5ccd90e7d05495ee7466950151cc6f639 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 12:20:09 +0100
Subject: [PATCH 325/504] Py_UNICODE is not compatible with 3.12

---
 spacy/pipeline/_parser_internals/search.pyx |  2 +-
 spacy/tests/parser/_search.pyx              | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 578299b56ae..52d5cdaa891 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
 from cymem.cymem cimport Pool
 from libc.math cimport exp
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index cd9e6b2f5ee..ca2a2916094 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -12,7 +12,7 @@ from ..conftest import cytest
 cdef struct TestState:
     int length
     int x
-    Py_UNICODE* string
+    char *string
 
 
 cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
@@ -22,7 +22,7 @@ cdef int transition(void* dest, void* src, class_t clas, void* extra_args) excep
     dest_state.x = src_state.x
     dest_state.x += clas
     if extra_args != NULL:
-        dest_state.string = <Py_UNICODE*>extra_args
+        dest_state.string = <char *>extra_args
     else:
         dest_state.string = src_state.string
 
@@ -32,9 +32,9 @@ cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
     state.length = n
     state.x = 1
     if extra_args == NULL:
-        state.string = u'default'
+        state.string = 'default'
     else:
-        state.string = <Py_UNICODE*>extra_args
+        state.string = <char *>extra_args
     return state
 
 
@@ -77,7 +77,7 @@ def test_initialize(nr_class, beam_width, length):
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length, s.length
-        assert s.string == 'default'
+        assert s.string.decode('utf8') == 'default'
 
 
 @cytest
@@ -88,11 +88,12 @@ def test_initialize(nr_class, beam_width, length):
                          ]
                          )
 def test_initialize_extra(nr_class, beam_width, length, extra):
+    extra = extra.encode("utf-8") if extra is not None else None
     b = Beam(nr_class, beam_width)
     if extra is None:
         b.initialize(initialize, destroy, length, NULL)
     else:
-        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+        b.initialize(initialize, destroy, length, <void*><char*>extra)
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length

From 6240456dd05ad584b83dabd4a36bde55c998955c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 14:59:01 +0100
Subject: [PATCH 326/504] Construct TextCatEnsemble.v2 using helper function

---
 spacy/ml/models/textcat.py | 44 +++++++-------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 4b3d2de9171..19ae2579984 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -19,6 +19,7 @@
     clone,
     concatenate,
     list2ragged,
+    noop,
     reduce_first,
     reduce_last,
     reduce_max,
@@ -160,55 +161,26 @@ def build_text_classifier_v2(
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
-    # TODO: build the model with _build_parametric_attention_with_residual_nonlinear
-    # in spaCy v4. We don't do this in spaCy v3 to preserve model
-    # compatibility.
+    width = tok2vec.maybe_get_dim("nO")
     exclusive_classes = not linear_model.attrs["multi_label"]
+    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
+        tok2vec=tok2vec,
+        nonlinear_layer=Maxout(nI=width, nO=width),
+        key_transform=noop(),
+    )
     with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention(width)
-        maxout_layer = Maxout(nO=width, nI=width)
-        norm_layer = LayerNorm(nI=width)
-        cnn_model = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(maxout_layer >> norm_layer >> Dropout(0.0))
-        )
-
         nO_double = nO * 2 if nO else None
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=nO_double)
         else:
             output_layer = Linear(nO=nO, nI=nO_double) >> Logistic()
-        model = (linear_model | cnn_model) >> output_layer
+        model = (linear_model | parametric_attention) >> output_layer
         model.set_ref("tok2vec", tok2vec)
     if model.has_dim("nO") is not False and nO is not None:
         model.set_dim("nO", cast(int, nO))
     model.set_ref("output_layer", linear_model.get_ref("output_layer"))
-    model.set_ref("attention_layer", attention_layer)
-    model.set_ref("maxout_layer", maxout_layer)
-    model.set_ref("norm_layer", norm_layer)
     model.attrs["multi_label"] = not exclusive_classes
 
-    model.init = init_ensemble_textcat  # type: ignore[assignment]
-    return model
-
-
-def init_ensemble_textcat(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("maxout_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
     return model
 
 

From c0b1c647827bd57685faee2fac5822d206ed90c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 15:02:02 +0100
Subject: [PATCH 327/504] Remove `setup_requires` from `setup.cfg`

---
 setup.cfg | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index b57fdc52bf9..8dcaf79d278 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,14 +30,6 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0

From 2d7423997d63aecce481d550c2662df895e8f242 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 17:18:49 +0100
Subject: [PATCH 328/504] Fix up requirements test

To account for buil dependencies being removed from `setup.cfg`.
---
 spacy/tests/package/test_requirements.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index a63b1d8b060..86bdc730c19 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -67,26 +67,28 @@ def test_build_dependencies():
                     "{} and {} respectively".format(lib, v, req_v)
                 )
                 setup_keys.add(lib)
-    assert sorted(setup_keys) == sorted(
-        req_dict.keys()
-    )  # if fail: requirements.txt contains a lib not in setup.cfg
 
     # check pyproject.toml and compare the versions of the libs to requirements.txt
     # does not fail when there are missing or additional libs
     toml_file = root_dir / "pyproject.toml"
     with toml_file.open() as f:
         lines = f.readlines()
+    pyproject_keys = set()
     for line in lines:
         line = line.strip().strip(",").strip('"')
         if not line.startswith("#"):
             lib, v = _parse_req(line)
             if lib and lib not in libs_ignore_requirements:
+                pyproject_keys.add(lib)
                 req_v = req_dict.get(lib, None)
                 assert (lib + v) == (lib + req_v), (
                     "{} has different version in pyproject.toml and in requirements.txt: "
                     "{} and {} respectively".format(lib, v, req_v)
                 )
 
+    # if fail: requirements.txt contains a lib not in setup.cfg or pyproject.toml
+    assert set(setup_keys).union(set(pyproject_keys)) == set(req_dict.keys())
+
 
 def _parse_req(line):
     lib = re.match(r"^[a-z0-9\-]*", line).group(0)

From 2b00d56ce4baeb2431e3d23b3159dae311354b4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 12:54:23 +0100
Subject: [PATCH 329/504] Set version to v4.0.0.dev2 (#13269)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 73f201af5fb..ef80718fee0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev1"
+__version__ = "4.0.0.dev2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From dd6aaefc315632936985540404b6e66f5eee5966 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 18:24:22 +0100
Subject: [PATCH 330/504] Update `spacy-legacy` dependency to 4.0.0.dev1
 (#13270)

This release is compatible with the parser refactor backout.
---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index bee5535257f..4b58e75506d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev0,<4.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index 8dcaf79d278..55e6942622d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,7 +32,7 @@ include_package_data = true
 python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=4.0.0.dev0,<4.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0

From d018b7bce05dba19b5f967b781f0577e7d9ebcda Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Feb 2024 14:14:55 +0100
Subject: [PATCH 331/504] Remove debug data normalization for span analysis
 (#13203)

* Remove debug data normalization for span analysis

As a result of this normalization, `debug data` could show a user tokens
that do not exist in their data.

* Update spacy/cli/debug_data.py

---------

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/debug_data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 714969be145..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
+            t = token.text.lower()
             word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)

From dc47b374608d163c0ec571814f185bb224a28ae7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 12 Jan 2022 13:38:52 +0100
Subject: [PATCH 332/504] Span/SpanGroup: wrap SpanC in shared_ptr (#9869)

* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/_parser_internals/ner.pyx |  17 ++-
 spacy/tests/doc/test_span.py             |  11 ++
 spacy/tokens/span.pxd                    |   1 +
 spacy/tokens/span.pyx                    | 147 ++++++++++++-----------
 spacy/tokens/span_group.pyx              |   2 +
 5 files changed, 109 insertions(+), 69 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index be769bd9cd0..7577f3f18b3 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,6 +1,8 @@
 import os
 import random
-
+from libc.stdint cimport int32_t
+from libcpp.memory cimport shared_ptr
+from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
@@ -50,6 +52,7 @@ MOVE_NAMES[OUT] = 'O'
 cdef struct GoldNERStateC:
     Transition* ner
     vector[shared_ptr[SpanC]] negs
+    vector[shared_ptr[SpanC]] negs
 
 
 cdef class BiluoGold:
@@ -95,6 +98,8 @@ cdef GoldNERStateC create_gold_state(
     # In order to handle negative samples, we need to maintain the full
     # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
     # thing, we'll get blocked if there's an incorrect prefix.
+    for neg in negs:
+        gs.negs.push_back(neg.c)
     for neg in negs:
         gs.negs.push_back(neg.c)
     return gs
@@ -413,6 +418,8 @@ cdef class Begin:
 
         cdef shared_ptr[SpanC] span
 
+        cdef shared_ptr[SpanC] span
+
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
@@ -430,6 +437,8 @@ cdef class Begin:
             # be correct or not. However, we can at least tell whether we're
             # going to be opening an entity where there's only one possible
             # L.
+            for span in gold.negs:
+                if span.get().label == label and span.get().start == b0:
             for span in gold.negs:
                 if span.get().label == label and span.get().start == b0:
                     cost += 1
@@ -573,6 +582,9 @@ cdef class Last:
         # by marking actions that close an entity that we know is incorrect
         # as costly.
         cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
+        cdef shared_ptr[SpanC] span
         for span in gold.negs:
             if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
                 cost += 1
@@ -639,6 +651,9 @@ cdef class Unit:
         # action
         cdef int b0 = s.B(0)
         cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
+        cdef shared_ptr[SpanC] span
         for span in gold.negs:
             if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
                 cost += 1
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index cf850a2234d..8452a5152aa 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,6 +6,7 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
@@ -163,6 +164,16 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
 @pytest.mark.issue(9556)
 def test_modify_span_group(doc):
     group = SpanGroup(doc, spans=doc.ents)
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index fb592e68bd8..68f722a13cb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 from libcpp.memory cimport shared_ptr
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 332123ad774..e84a15c7902 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -3,9 +3,6 @@ cimport numpy as np
 from libc.math cimport sqrt
 from libcpp.memory cimport make_shared
 
-import copy
-import warnings
-
 import numpy
 from thinc.api import get_array_module
 
@@ -118,6 +115,7 @@ cdef class Span:
             end_char = start_char
         else:
             end_char = doc[end - 1].idx + len(doc[end - 1])
+        self.c = make_shared[SpanC](SpanC(
         self.c = make_shared[SpanC](SpanC(
             label=label,
             kb_id=kb_id,
@@ -127,6 +125,7 @@ cdef class Span:
             start_char=start_char,
             end_char=end_char,
         ))
+        ))
         self._vector = vector
         self._vector_norm = vector_norm
 
@@ -137,14 +136,17 @@ cdef class Span:
             else:
                 return True
 
-        self_tuple = self._cmp_tuple()
-        other_tuple = other._cmp_tuple()
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* other_span_c = other.span_c()
+
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
+            return span_c.start_char < other_span_c.start_char
         # <=
         elif op == 1:
             return span_c.start_char <= other_span_c.start_char
+            return span_c.start_char <= other_span_c.start_char
         # ==
         elif op == 2:
             # Do the cheap comparisons first
@@ -155,6 +157,14 @@ cdef class Span:
                 (span_c.kb_id == other_span_c.kb_id) and \
                 (self.doc == other.doc)
             )
+            # Do the cheap comparisons first
+            return (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # !=
         elif op == 3:
             # Do the cheap comparisons first
@@ -165,28 +175,26 @@ cdef class Span:
                 (span_c.kb_id == other_span_c.kb_id) and \
                 (self.doc == other.doc)
             )
+            # Do the cheap comparisons first
+            return not (
+                (span_c.start_char == other_span_c.start_char) and \
+                (span_c.end_char == other_span_c.end_char) and \
+                (span_c.label == other_span_c.label) and \
+                (span_c.kb_id == other_span_c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # >
         elif op == 4:
             return span_c.start_char > other_span_c.start_char
+            return span_c.start_char > other_span_c.start_char
         # >=
         elif op == 5:
             return span_c.start_char >= other_span_c.start_char
+            return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
-        return hash(self._cmp_tuple())
-
-    def _cmp_tuple(self):
         cdef SpanC* span_c = self.span_c()
-        return (
-            span_c.start_char,
-            span_c.end_char,
-            span_c.start,
-            span_c.end,
-            span_c.label,
-            span_c.kb_id,
-            span_c.id,
-            self.doc,
-        )
+        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
 
     def __len__(self):
         """Get the number of tokens in the span.
@@ -196,9 +204,12 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#len
         """
         cdef SpanC* span_c = self.span_c()
+        if span_c.end < span_c.start:
+        cdef SpanC* span_c = self.span_c()
         if span_c.end < span_c.start:
             return 0
         return span_c.end - span_c.start
+        return span_c.end - span_c.start
 
     def __repr__(self):
         return self.text
@@ -213,14 +224,18 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#getitem
         """
         cdef SpanC* span_c = self.span_c()
+        cdef SpanC* span_c = self.span_c()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
         else:
             if i < 0:
                 token_i = span_c.end + i
+                token_i = span_c.end + i
             else:
                 token_i = span_c.start + i
+            if span_c.start <= token_i < span_c.end:
+                token_i = span_c.start + i
             if span_c.start <= token_i < span_c.end:
                 return self.doc[token_i]
             else:
@@ -234,6 +249,8 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#iter
         """
         cdef SpanC* span_c = self.span_c()
+        for i in range(span_c.start, span_c.end):
+        cdef SpanC* span_c = self.span_c()
         for i in range(span_c.start, span_c.end):
             yield self.doc[i]
 
@@ -242,10 +259,11 @@ cdef class Span:
 
     @property
     def _(self):
+        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                          start=span_c.start_char, end=span_c.end_char)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -329,6 +347,7 @@ cdef class Span:
         cdef attr_t value
         cdef int i, head_col, ancestor_i
         cdef SpanC* span_c = self.span_c()
+        cdef SpanC* span_c = self.span_c()
         old_to_new_root = dict()
         if HEAD in attrs:
             head_col = attrs.index(HEAD)
@@ -336,6 +355,7 @@ cdef class Span:
                 # if the HEAD refers to a token outside this span, find a more appropriate ancestor
                 token = self[i]
                 ancestor_i = token.head.i - span_c.start   # span offset
+                ancestor_i = token.head.i - span_c.start   # span offset
                 if ancestor_i not in range(length):
                     if DEP in attrs:
                         array[i, attrs.index(DEP)] = dep
@@ -343,6 +363,7 @@ cdef class Span:
                     # try finding an ancestor within this span
                     ancestors = token.ancestors
                     for ancestor in ancestors:
+                        ancestor_i = ancestor.i - span_c.start
                         ancestor_i = ancestor.i - span_c.start
                         if ancestor_i in range(length):
                             array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
@@ -374,6 +395,8 @@ cdef class Span:
         """
         cdef SpanC* span_c = self.span_c()
         return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
+        cdef SpanC* span_c = self.span_c()
+        return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -486,6 +509,9 @@ cdef class Span:
     cdef SpanC* span_c(self):
         return self.c.get()
 
+    cdef SpanC* span_c(self):
+        return self.c.get()
+
     @property
     def sents(self):
         """Obtain the sentences that contain this span. If the given span
@@ -544,9 +570,14 @@ cdef class Span:
         cdef Span ent
         cdef SpanC* span_c = self.span_c()
         cdef SpanC* ent_span_c
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* ent_span_c
         ents = []
         for ent in self.doc.ents:
             ent_span_c = ent.span_c()
+            if ent_span_c.start >= span_c.start:
+                if ent_span_c.end <= span_c.end:
+            ent_span_c = ent.span_c()
             if ent_span_c.start >= span_c.start:
                 if ent_span_c.end <= span_c.end:
                     ents.append(ent)
@@ -673,10 +704,12 @@ cdef class Span:
         # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
         cdef SpanC* span_c = self.span_c()
+        cdef SpanC* span_c = self.span_c()
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The
         # longer the span, the more likely it contains a sentence root, and
         # in this case we return in linear time.
+        for i in range(span_c.start, span_c.end):
         for i in range(span_c.start, span_c.end):
             if self.doc.c[i].head == 0:
                 return self.doc[i]
@@ -688,6 +721,8 @@ cdef class Span:
         # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
+        for i in range(span_c.start, span_c.end):
+            if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
         for i in range(span_c.start, span_c.end):
             if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
                 continue
@@ -697,6 +732,7 @@ cdef class Span:
                 root = i
         if root == -1:
             return self.doc[span_c.start]
+            return self.doc[span_c.start]
         else:
             return self.doc[root]
 
@@ -722,6 +758,10 @@ cdef class Span:
         start_idx += span_c.start_char
         end_idx += span_c.start_char
         return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+        cdef SpanC* span_c = self.span_c()
+        start_idx += span_c.start_char
+        end_idx += span_c.start_char
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
 
     @property
     def conjuncts(self):
@@ -802,87 +842,58 @@ cdef class Span:
     property start:
         def __get__(self):
             return self.span_c().start
+            return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0 or start > self.doc.length:
-                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
-            cdef SpanC* span_c = self.span_c()
-            if start > span_c.end:
-                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
-            span_c.start = start
-            span_c.start_char = self.doc.c[start].idx
+            if start < 0:
+                raise IndexError("TODO")
+            self.span_c().start = start
 
     property end:
         def __get__(self):
             return self.span_c().end
+            return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0 or end > self.doc.length:
-                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
-            cdef SpanC* span_c = self.span_c()
-            if span_c.start > end:
-                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
-            span_c.end = end
-            if end > 0:
-                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
-            else:
-                span_c.end_char = 0
+            if end < 0:
+                raise IndexError("TODO")
+            self.span_c().end = end
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
+            return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0 or start_char > len(self.doc.text):
-                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
-            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
-            if start < 0:
-                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
-            cdef SpanC* span_c = self.span_c()
-            if start_char > span_c.end_char:
-                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
-            span_c.start_char = start_char
-            span_c.start = start
+            if start_char < 0:
+                raise IndexError("TODO")
+            self.span_c().start_char = start_char
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
+            return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0 or end_char > len(self.doc.text):
-                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
-            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
-            if end < 0:
-                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
-            cdef SpanC* span_c = self.span_c()
-            if span_c.start_char > end_char:
-                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
-            span_c.end_char = end_char
-            span_c.end = end
+            if end_char < 0:
+                raise IndexError("TODO")
+            self.span_c().end_char = end_char
 
     property label:
         def __get__(self):
             return self.span_c().label
+            return self.span_c().label
 
         def __set__(self, attr_t label):
-            if label != self.span_c().label :
-                old_label = self.span_c().label
-                self.span_c().label = label
-                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
-                Underscore._replace_keys(old, new)
+            self.span_c().label = label
 
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
+            return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            if kb_id != self.span_c().kb_id :
-                old_kb_id = self.span_c().kb_id
-                self.span_c().kb_id = kb_id
-                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
-                Underscore._replace_keys(old, new)
+            self.span_c().kb_id = kb_id
 
     property id:
         def __get__(self):
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index bc5bb92d38c..55e70a91801 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -10,6 +10,8 @@ from spacy.errors import Errors
 from libcpp.memory cimport make_shared
 
 from .span cimport Span
+from libc.stdint cimport uint64_t, uint32_t, int32_t
+from libcpp.memory cimport make_shared
 
 
 cdef class SpanGroup:

From f8ea451661a931cb890ce5f4ec377b6c510ff423 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 15 Jul 2022 11:14:08 +0200
Subject: [PATCH 333/504] `Morphology`/`Morphologizer` optimizations and
 refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologzier`: Avoid unnecessary sorting of morpho features

* `Morphologizer`: Remove execessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
---
 spacy/morphology.pxd               | 15 +++++-
 spacy/morphology.pyx               | 77 +++++++++++++++++++++++++++---
 spacy/pipeline/morphologizer.pyx   |  3 +-
 spacy/tokens/morphanalysis.pxd     | 10 ++--
 spacy/tokens/morphanalysis.pyx     |  7 +++
 spacy/tokens/token.pyx             |  1 +
 website/docs/api/morphologizer.mdx |  4 +-
 7 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 5138d353cf0..7f833e96df1 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
-from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
@@ -19,7 +19,7 @@ cdef cppclass Feature:
 
 
 cdef cppclass MorphAnalysisC:
-    hash_t key  
+    hash_t key
     vector[Feature] features
 
     __init__():
@@ -28,6 +28,7 @@ cdef cppclass MorphAnalysisC:
 cdef class Morphology:
     cdef readonly StringStore strings
     cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
+    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
 
     cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
     cdef void _intern_morph_tag(self, hash_t tag_key, feats)
@@ -35,8 +36,18 @@ cdef class Morphology:
     cdef str _normalize_features(self, features)
     cdef str get_morph_str(self, hash_t morph_key)
     cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
+    cdef hash_t _add(self, features)
+    cdef str _normalize_features(self, features)
+    cdef str get_morph_str(self, hash_t morph_key)
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
 
 cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
 cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
 cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
 cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index e7f93b78b47..a7d1c51eab6 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,10 @@
 # cython: infer_types
 import warnings
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy
-
+from typing import Union, Tuple, List, Dict, Optional
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
+from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 
@@ -80,13 +78,15 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
+
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-            ])
+        ])
         return norm_feats_string or self.EMPTY_MORPH
 
+
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
@@ -95,10 +95,22 @@ cdef class Morphology:
         """
         cdef hash_t tag_hash = 0
         cdef shared_ptr[MorphAnalysisC] tag
+        cdef hash_t tag_hash = 0
+        cdef shared_ptr[MorphAnalysisC] tag
         if isinstance(features, str):
             if features == "":
                 features = self.EMPTY_MORPH
 
+            tag_hash = self.strings[features]
+            tag = self._lookup_tag(tag_hash)
+            if tag:
+                return deref(tag).key
+
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
+
             tag_hash = self.strings[features]
             tag = self._lookup_tag(tag_hash)
             if tag:
@@ -111,6 +123,7 @@ cdef class Morphology:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
 
+
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
         norm_feats_string = self._normalized_feat_dict_to_str(features)
@@ -138,7 +151,7 @@ cdef class Morphology:
                     field_feature_pairs.append((field_key, value_key))
             else:
                 # We could box scalar values into a list and use a common
-                # code path to generate features but that incurs a small 
+                # code path to generate features but that incurs a small
                 # but measurable allocation/iteration overhead (as this
                 # branch is taken often enough).
                 value_key = self.strings.add(field + self.FIELD_SEP + values)
@@ -165,6 +178,17 @@ cdef class Morphology:
     cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
         return self._lookup_tag(morph_key)
 
+    cdef str _normalize_features(self, features):
+    cdef str get_morph_str(self, hash_t morph_key):
+        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
+        if not tag:
+            return ""
+        else:
+            return self.strings[deref(tag).key]
+
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
+        return self._lookup_tag(morph_key)
+
     cdef str _normalize_features(self, features):
         """Create a normalized FEATS string from a features string or dict.
 
@@ -175,6 +199,10 @@ cdef class Morphology:
             features = self._str_to_normalized_feat_dict(features)
         elif isinstance(features, dict):
             features = self._dict_to_normalized_feat_dict(features)
+        else:
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
         else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
@@ -187,10 +215,22 @@ cdef class Morphology:
     def get(self, morph_key):
         return self.get_morph_str(morph_key)
 
+    def normalize_features(self, features):
+        return self._normalize_features(features)
+
+        return self._normalized_feat_dict_to_str(features)
+
+    def add(self, features):
+        return self._add(features)
+
+    def get(self, morph_key):
+        return self.get_morph_str(morph_key)
+
     def normalize_features(self, features):
         return self._normalize_features(features)
 
     @staticmethod
+    def feats_to_dict(feats, *, sort_values=True):
     def feats_to_dict(feats, *, sort_values=True):
         if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
@@ -206,6 +246,17 @@ cdef class Morphology:
             out[field] = values
         return out
 
+        out = {}
+        for feat in feats.split(Morphology.FEATURE_SEP):
+            field, values = feat.split(Morphology.FIELD_SEP, 1)
+            if sort_values:
+                values = values.split(Morphology.VALUE_SEP)
+                values.sort()
+                values = Morphology.VALUE_SEP.join(values)
+
+            out[field] = values
+        return out
+
     @staticmethod
     def dict_to_feats(feats_dict):
         if len(feats_dict) == 0:
@@ -213,31 +264,43 @@ cdef class Morphology:
         return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
 
 
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
 cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
     cdef int i
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].value == feature:
     for i in range(deref(morph).features.size()):
         if deref(morph).features[i].value == feature:
             return True
     return False
 
 
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
 cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
     cdef int i
     features = []
+    for i in range(deref(morph).features.size()):
+        features.append(deref(morph).features[i].value)
     for i in range(deref(morph).features.size()):
         features.append(deref(morph).features[i].value)
     return features
 
 
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
+    cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
 cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
     cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
     n = get_n_by_field(<uint64_t*>results.data, morph, field)
     return results[:n]
 
 
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
 cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
     cdef int n_results = 0
     cdef int i
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].field == field:
+            results[n_results] = deref(morph).features[i].value
     for i in range(deref(morph).features.size()):
         if deref(morph).features[i].field == field:
             results[n_results] = deref(morph).features[i].value
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 0f77326e67d..43e36b36844 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -294,8 +294,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
-                                                    label_smoothing=self.cfg["label_smoothing"])
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 73922c62b9b..33322d0187f 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,7 @@
-from libcpp.memory cimport shared_ptr
-
-from ..morphology cimport MorphAnalysisC
-from ..typedefs cimport hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t
+from ..morphology cimport MorphAnalysisC
+from libcpp.memory cimport shared_ptr
 
 
 cdef class MorphAnalysis:
@@ -11,3 +10,6 @@ cdef class MorphAnalysis:
     cdef shared_ptr[MorphAnalysisC] c
 
     cdef void _init_c(self, hash_t key)
+    cdef shared_ptr[MorphAnalysisC] c
+
+    cdef void _init_c(self, hash_t key)
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index f3841baa24a..2ee7565ea85 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -9,6 +9,13 @@ from libcpp.memory cimport shared_ptr
 from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t, attr_t
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 47b4898bb75..66cae659e75 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -40,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref
 
 from cython.operator cimport dereference as deref
 
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 9514bc773b9..61abe043e77 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -454,8 +454,8 @@ coarse-grained POS as the feature `POS`.
 > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
 > ```
 
-| Name        | Description                                               |
-| ----------- | --------------------------------------------------------- |
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}

From 86064b7fa311cca4ea4feb5ad6e1f77623eb5fcf Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 10 Aug 2022 11:44:05 +0200
Subject: [PATCH 334/504] Rename modules for consistency (#11286)

* rename Python module to entity_ruler

* rename Python module to attribute_ruler
---
 spacy/pipeline/__init__.py            |   2 +
 website/docs/api/entityruler.mdx      | 298 ++++++++++++++++++++++----
 website/docs/usage/saving-loading.mdx |   6 +-
 3 files changed, 266 insertions(+), 40 deletions(-)

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index e26f7436efa..af5bb05a0f7 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,8 +1,10 @@
 from .attribute_ruler import AttributeRuler
+from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
+from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 7976e7725e0..293162572c6 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,5 +1,7 @@
 ---
 title: EntityRuler
+tag: class
+source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
@@ -75,51 +77,273 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-## Migrating from v3 {#migrating}
+```python
+%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
+```
 
-### Loading patterns
+## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
 
-Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
-initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
-path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
-JSONL file separately and then added through
-[`SpanRuler.initialize`](/api/spanruler#initialize]) or
-[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
+Initialize the entity ruler. If patterns are supplied here, they need to be a
+list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
+be a token pattern (list) or a phrase pattern (string). For example:
+`{"label": "ORG", "pattern": "Apple"}`.
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.from_disk("patterns.jsonl")
-+ import srsly
-+ patterns = srsly.read_jsonl("patterns.jsonl")
-+ ruler.add_patterns(patterns)
-```
+> #### Example
+>
+> ```python
+> # Construction via add_pipe
+> ruler = nlp.add_pipe("entity_ruler")
+>
+> # Construction from class
+> from spacy.pipeline import EntityRuler
+> ruler = EntityRuler(nlp, overwrite_ents=True)
+> ```
 
-### Saving patterns
+| Name                                                 | Description                                                                                                                                                                                                                           |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
+| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
+| _keyword-only_                                       |                                                                                                                                                                                                                                       |
+| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
+| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
+| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
+| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
+| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
+| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
+| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
 
-`SpanRuler.to_disk` always saves the full component data to a directory and does
-not include an option to save the patterns to a single JSONL file.
+## EntityRuler.initialize {id="initialize",tag="method",version="3"}
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.to_disk("patterns.jsonl")
-+ import srsly
-+ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
-```
+Initialize the component with data and used before training to load in rules
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
+method is typically called by [`Language.initialize`](/api/language#initialize)
+and lets you customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
-### Accessing token and phrase patterns
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl
+> ```
 
-The separate token patterns and phrase patterns are no longer accessible under
-`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
-patterns in their original format using the property
-[`SpanRuler.patterns`](/api/spanruler#patterns).
+| Name           | Description                                                                                                                                                          |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |                                                                                                                                                                      |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
 
-### Removing patterns by ID
+## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
 
-[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
-remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
+The number of all patterns added to the entity ruler.
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.remove("id")
-+ ruler.remove_by_id("id")
-```
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> assert len(ruler) == 0
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+> assert len(ruler) == 1
+> ```
+
+| Name        | Description                     |
+| ----------- | ------------------------------- |
+| **RETURNS** | The number of patterns. ~~int~~ |
+
+## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
+
+Whether a label is present in the patterns.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+> assert "ORG" in ruler
+> assert not "PERSON" in ruler
+> ```
+
+| Name        | Description                                           |
+| ----------- | ----------------------------------------------------- |
+| `label`     | The label to check. ~~str~~                           |
+| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
+
+## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
+
+Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
+happens automatically after the component has been added to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
+with `overwrite_ents=True`, existing entities will be replaced if they overlap
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes
+longer patterns over shorter, and if equal the match occuring first in the Doc
+is chosen.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+>
+> doc = nlp("A text about Apple.")
+> ents = [(ent.text, ent.label_) for ent in doc.ents]
+> assert ents == [("Apple", "ORG")]
+> ```
+
+| Name        | Description                                                          |
+| ----------- | -------------------------------------------------------------------- |
+| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
+| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+
+## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+
+Add patterns to the entity ruler. A pattern can either be a token pattern (list
+of dicts) or a phrase pattern (string). For more details, see the usage guide on
+[rule-based matching](/usage/rule-based-matching).
+
+> #### Example
+>
+> ```python
+> patterns = [
+>     {"label": "ORG", "pattern": "Apple"},
+>     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
+> ]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ```
+
+| Name       | Description                                                      |
+| ---------- | ---------------------------------------------------------------- |
+| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
+
+## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
+
+Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
+the ID does not exist.
+
+> #### Example
+>
+> ```python
+> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ruler.remove("apple")
+> ```
+
+| Name | Description                         |
+| ---- | ----------------------------------- |
+| `id` | The ID of the pattern rule. ~~str~~ |
+
+## EntityRuler.to_disk {id="to_disk",tag="method"}
+
+Save the entity ruler patterns to a directory. The patterns will be saved as
+newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
+only the patterns are saved as JSONL. If a directory name is provided, a
+`patterns.jsonl` and `cfg` file with the component configuration is exported.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
+> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
+> ```
+
+| Name   | Description                                                                                                                                              |
+| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+
+## EntityRuler.from_disk {id="from_disk",tag="method"}
+
+Load the entity ruler from a path. Expects either a file containing
+newline-delimited JSON (JSONL) with one entry per line, or a directory
+containing a `patterns.jsonl` file and a `cfg` file with the component
+configuration.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
+> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
+> ```
+
+| Name        | Description                                                                                                   |
+| ----------- | ------------------------------------------------------------------------------------------------------------- |
+| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
+
+## EntityRuler.to_bytes {id="to_bytes",tag="method"}
+
+Serialize the entity ruler patterns to a bytestring.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler_bytes = ruler.to_bytes()
+> ```
+
+| Name        | Description                        |
+| ----------- | ---------------------------------- |
+| **RETURNS** | The serialized patterns. ~~bytes~~ |
+
+## EntityRuler.from_bytes {id="from_bytes",tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> ruler_bytes = ruler.to_bytes()
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.from_bytes(ruler_bytes)
+> ```
+
+| Name         | Description                                        |
+| ------------ | -------------------------------------------------- |
+| `bytes_data` | The bytestring to load. ~~bytes~~                  |
+| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
+
+## EntityRuler.labels {id="labels",tag="property"}
+
+All labels present in the match patterns.
+
+| Name        | Description                            |
+| ----------- | -------------------------------------- |
+| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+
+## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+
+All entity IDs present in the `id` properties of the match patterns.
+
+| Name        | Description                         |
+| ----------- | ----------------------------------- |
+| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+
+## EntityRuler.patterns {id="patterns",tag="property"}
+
+Get all patterns that were added to the entity ruler.
+
+| Name        | Description                                                                              |
+| ----------- | ---------------------------------------------------------------------------------------- |
+| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+
+## Attributes {id="attributes"}
+
+| Name              | Description                                                                                                           |
+| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
+| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
+| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
+| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 97ae3c5e573..a491b182c6e 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,9 +187,9 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the
-> [`SpanRuler`](/api/spanruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the new
+> [`EntityRuler`](/api/entityruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with rule-based components _with_ all the component

From 6e4257a5cbf9f74fe40aad1df316e2cdbf811299 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 22 Aug 2022 15:52:24 +0200
Subject: [PATCH 335/504] Cleanup Cython structs (#11337)

* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumpsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
---
 spacy/tokenizer.pyx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 1fc5f310920..88cd0f37dd0 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -9,11 +9,17 @@ from preshed.maps cimport PreshMap
 
 import re
 
+from .tokens.doc cimport Doc
+from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
+from .attrs import intify_attrs
+from .symbols import ORTH, NORM
+from .errors import Errors
 from . import util
+from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
@@ -142,8 +148,10 @@ cdef class Tokenizer:
     property faster_heuristics:
         def __get__(self):
             return self._faster_heuristics
+            return self._faster_heuristics
 
         def __set__(self, faster_heuristics):
+            self._faster_heuristics = faster_heuristics
             self._faster_heuristics = faster_heuristics
             self._reload_special_cases()
 >>>>>>> 5abfa8215 (Cleanup Cython structs (#11337))

From 040a18f21dd72add4cbcdf8078840cdcb82270a3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 22 Aug 2022 20:28:57 +0200
Subject: [PATCH 336/504] Make Span/Doc.ents more consistent for ent_kb_id and
 ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
---
 spacy/tests/doc/test_span.py               | 47 ----------------------
 spacy/tokens/span.pyi                      | 10 ++++-
 spacy/tokens/span.pyx                      | 16 +++++---
 website/docs/api/span.mdx                  |  1 +
 website/docs/api/token.mdx                 |  1 +
 website/docs/usage/rule-based-matching.mdx | 10 +++--
 6 files changed, 26 insertions(+), 59 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 8452a5152aa..15ea3614901 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -706,50 +706,3 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
-
-
-def test_span_start_end_sync(en_tokenizer):
-    doc = en_tokenizer("a bc def e fghij kl")
-    # can create and edit span starts/ends
-    span = doc[2:4]
-    span.start_char = 2
-    span.end = 5
-    assert span == doc[span.start : span.end]
-    assert span == doc.char_span(span.start_char, span.end_char)
-    # cannot set completely out of bounds starts/ends
-    with pytest.raises(IndexError):
-        span.start = -1
-    with pytest.raises(IndexError):
-        span.end = -1
-    with pytest.raises(IndexError):
-        span.start_char = len(doc.text) + 1
-    with pytest.raises(IndexError):
-        span.end = len(doc.text) + 1
-    # test all possible char starts/ends
-    span = doc[0 : len(doc)]
-    token_char_starts = [token.idx for token in doc]
-    token_char_ends = [token.idx + len(token.text) for token in doc]
-    for i in range(len(doc.text)):
-        if i not in token_char_starts:
-            with pytest.raises(ValueError):
-                span.start_char = i
-        else:
-            span.start_char = i
-    span = doc[0 : len(doc)]
-    for i in range(len(doc.text)):
-        if i not in token_char_ends:
-            with pytest.raises(ValueError):
-                span.end_char = i
-        else:
-            span.end_char = i
-    # start must be <= end
-    span = doc[1:3]
-    with pytest.raises(ValueError):
-        span.start = 4
-    with pytest.raises(ValueError):
-        span.end = 0
-    span = doc.char_span(2, 8)
-    with pytest.raises(ValueError):
-        span.start_char = 9
-    with pytest.raises(ValueError):
-        span.end_char = 1
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index f1030278c69..3b93ffdaa0b 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -134,13 +134,19 @@ class Span:
     @property
     def ent_id(self) -> int: ...
     @property
+    def label(self) -> int: ...
+    @property
+    def kb_id(self) -> int: ...
+    @property
+    def id(self) -> int: ...
+    @property
+    def ent_id(self) -> int: ...
+    @property
     def orth_(self) -> str: ...
     @property
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
-    @label_.setter
-    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e84a15c7902..409180b4973 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -898,22 +898,22 @@ cdef class Span:
     property id:
         def __get__(self):
             return self.span_c().id
+            return self.span_c().id
 
         def __set__(self, attr_t id):
-            if id != self.span_c().id :
-                old_id = self.span_c().id
-                self.span_c().id = id
-                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
-                Underscore._replace_keys(old, new)
+            self.span_c().id = id
 
     property ent_id:
+        """Alias for the span's ID."""
         """Alias for the span's ID."""
         def __get__(self):
             return self.id
+            return self.id
 
         def __set__(self, attr_t ent_id):
             self.id = ent_id
+        def __set__(self, attr_t ent_id):
+            self.id = ent_id
 
     @property
     def orth_(self):
@@ -929,6 +929,7 @@ cdef class Span:
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
     property label_:
+        """The span's label."""
         """The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
@@ -938,6 +939,7 @@ cdef class Span:
         self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
+        """The span's KB ID."""
         """The span's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
@@ -947,6 +949,7 @@ cdef class Span:
         self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
     property id_:
+        """The span's ID."""
         """The span's ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.id]
@@ -964,6 +967,7 @@ cdef class Span:
             self.id_ = ent_id_
 
 
+
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index cd70d8dcead..b1a9bea200e 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -567,4 +567,5 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
+| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 16d421c12f4..12b99394350 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,6 +470,7 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
+| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 8469d587ed1..2e5545f0df2 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1403,15 +1403,17 @@ ruler.add_patterns(patterns)
 
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
 
 doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given in
-the patterns. So in the example above it's easy to identify that "San Francisco"
-and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given
+in the patterns. So in the example above it's easy to identify that "San
+Francisco" and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 

From 28f2630748e0be49edd4e63149e796c4677d4065 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Aug 2022 13:23:24 +0200
Subject: [PATCH 337/504] Remove setup_requires from setup.cfg (#11384)

* Remove setup_requires from setup.cfg

* Update requirements test to ignore cython in setup.cfg
---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 55e6942622d..9b847bb57ab 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,7 +29,7 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.8
+python_requires = >=3.6
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev1,<4.1.0

From fe0b9b753f939d1f7d486ed6adcf6121ff15de84 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Aug 2022 13:56:35 +0200
Subject: [PATCH 338/504] Make stable private modules public and adjust names
 (#11353)

* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
---
 spacy/ml/models/tok2vec.py        |  3 +++
 spacy/pipeline/attribute_ruler.py |  2 +-
 spacy/tokens/__init__.py          |  3 ++-
 spacy/tokens/doc.pyi              |  7 +++++--
 spacy/tokens/doc.pyx              | 15 ++++++++++++++-
 spacy/tokens/doc_bin.py           |  4 +---
 6 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 61bc7291e2e..9372a665f2c 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,6 +22,8 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
+from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
@@ -241,6 +243,7 @@ def CharacterEmbed(
     if feature is None:
         raise ValueError(Errors.E911.format(feat=feature))
     char_embed = chain(
+        character_embed.CharacterEmbed(nM=nM, nC=nC),
         character_embed.CharacterEmbed(nM=nM, nC=nC),
         cast(Model[List[Floats2d], Ragged], list2ragged()),
     )
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 76f82b84e38..126a48945bc 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -11,7 +11,7 @@
 from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
-from ..training import Example
+from ..vocab import Vocab
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 7617e462fde..16c43485340 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,6 +4,7 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .token import Token
+from .doc_bin import DocBin
+from .morphanalysis import MorphAnalysis
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index dc7c0143029..1304a8aae8d 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -15,8 +15,11 @@ from typing import (
 
 import numpy as np
 from cymem.cymem import Pool
-from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-
+from thinc.types import Floats1d, Floats2d, Ints2d
+from .span import Span
+from .token import Token
+from .span_groups import SpanGroups
+from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
 from ..vocab import Vocab
 from .retokenizer import Retokenizer
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4b8a15a65fd..169199bc563 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,8 +20,15 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-
 from .span_groups import SpanGroups
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..typedefs cimport attr_t, flags_t
+from ..attrs cimport attr_id_t
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
+
+from ._dict_proxies import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -50,6 +57,12 @@ from ..attrs import IDS, intify_attr
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
+from .. import util
+from .. import parts_of_speech
+from .. import schemas
+from .underscore import Underscore, get_ext_args
+from .retokenizer import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from .retokenizer import Retokenizer
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 4dda40a05ee..8a08864d46e 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,9 +10,7 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import SimpleFrozenList, ensure_path
-from ..vocab import Vocab
-from .doc import Doc
+from ..util import ensure_path, SimpleFrozenList
 from .span_groups import SpanGroups
 
 # fmt: off

From 02bb6474d87899abd181a3a163c2b87ffde33ccc Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 30 Aug 2022 22:40:31 +0900
Subject: [PATCH 339/504] Update/remove old Matcher syntax (#11370)

* Clean up old Matcher call style related stuff

In v2 Matcher.add was called with (key, on_match, *patterns). In v3 this
was changed to (key, patterns, *, on_match=None), but there were various
points where the old call syntax was documented or handled specially.
This removes all those.

The Matcher itself didn't need any code changes, as it just gives a
generic type error. However the PhraseMatcher required some changes
because it would automatically "fix" the old call style.

Surprisingly, the tokenizer was still using the old call style in one
place.

After these changes tests failed in two places:

1. one test for the "new" call style, including the "old" call style. I
   removed this test.
2. deserializing the PhraseMatcher fails because the input docs are a
   set.

I am not sure why 2 is happening - I guess it's a quirk of the
serialization format? - so for now I just convert the set to a list when
deserializing. The check that the input Docs are a List in the
PhraseMatcher is a new check, but makes it parallel with the other
Matchers, which seemed like the right thing to do.

* Add notes related to input docs / deserialization type

* Remove Typing import

* Remove old note about call style change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Use separate method for setting internal doc representations

In addition to the title change, this changes the internal dict to be a
defaultdict, instead of a dict with frequent use of setdefault.

* Add _add_from_arrays for unpickling

* Cleanup around adding from arrays

This moves adding to internal structures into the private batch method,
and removes the single-add method.

This has one behavioral change for `add`, in that if something is wrong
with the list of input Docs (such as one of the items not being a Doc),
valid items before the invalid one will not be added. Also the callback
will not be updated if anything is invalid. This change should not be
significant.

This also adds a test to check failure when given a non-Doc.

* Update spacy/matcher/phrasematcher.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                 | 35 ++++++++++-----------------------
 spacy/matcher/phrasematcher.pyx | 15 +++++++++++---
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 1af8a3b0891..6bb7d992954 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -498,6 +498,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
+    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
     E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
             "callable or None, but got: {arg_type}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
@@ -750,6 +751,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "loaded nlp object, but got: {source}")
     E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
+    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
     E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
     E949 = ("Unable to align tokens for the predicted and reference docs. It "
@@ -980,33 +982,16 @@ class Errors(metaclass=ErrorsWithCodes):
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
-    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
-             "but got '{received_type}'")
-    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
-    E4003 = ("Training examples for distillation must have the exact same tokens in the "
-             "reference and predicted docs.")
-    E4004 = ("Backprop is not supported when is_train is not set.")
-    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
-    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
-    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
-             "{existing_value}.")
-    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
-    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
-    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
-             "[initialize] or in registered lookups (spacy-lookups-data). An "
-             "example for how to load lemmatizer tables in [initialize]:\n\n"
-             "[initialize.components]\n\n"
-             "[initialize.components.{pipe_name}]\n\n"
-             "[initialize.components.{pipe_name}.lookups]\n"
-             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
-             "lang = ${{nlp.lang}}\n"
-             f'url = "{about.__lookups_url__}"\n'
-             "tables = {tables}\n"
-             "# or required tables only: tables = {required_tables}\n")
-    E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
-RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
+# Deprecated model shortcuts, only used in errors and warnings
+OLD_MODEL_SHORTCUTS = {
+    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
+}
+
 
 # fmt: on
 
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index d1a8eaf33c4..f36c93e8f32 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,9 +1,8 @@
 # cython: infer_types=True, profile=True
-from collections import defaultdict
 from typing import List
-
+from collections import defaultdict
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
+from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 
 import warnings
 
@@ -45,6 +44,7 @@ cdef class PhraseMatcher:
         self.vocab = vocab
         self._callbacks = {}
         self._docs = defaultdict(set)
+        self._docs = defaultdict(set)
         self._validate = validate
 
         self.mem = Pool()
@@ -160,22 +160,29 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
+
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
         key (str): The match ID.
         specs (List[List[int]]): A list of lists of hashes to match.
+        specs (List[List[int]]): A list of lists of hashes to match.
         on_match (callable): Callback executed on match.
         """
+        """
         cdef MapStruct* current_node
         cdef MapStruct* internal_node
         cdef void* result
 
+        self._callbacks[key] = on_match
+        for spec in specs:
+            self._docs[key].add(tuple(spec))
         self._callbacks[key] = on_match
         for spec in specs:
             self._docs[key].add(tuple(spec))
 
             current_node = self.c_map
+            for token in spec:
             for token in spec:
                 if token == self._terminal_hash:
                     warnings.warn(Warnings.W021)
@@ -195,6 +202,7 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
+
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
@@ -358,6 +366,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
         matcher._add_from_arrays(key, specs, on_match=callback)
+        matcher._add_from_arrays(key, specs, on_match=callback)
     return matcher
 
 

From 67dee50fa80e83a026718be628ebc03263d45243 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Sep 2022 09:08:40 +0200
Subject: [PATCH 340/504] Consolidate and freeze symbols (#11352)

* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
---
 spacy/parts_of_speech.pxd   |  2 +-
 spacy/strings.pyx           | 57 +++++++++++++++++++++----------------
 spacy/tests/test_symbols.py |  1 -
 3 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 22a571be7b0..01f116ea688 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
     ADV = symbols.ADV
     AUX = symbols.AUX
     CONJ = symbols.CONJ
-    CCONJ = symbols.CCONJ  # U20
+    CCONJ  = symbols.CCONJ  # U20
     DET = symbols.DET
     INTJ = symbols.INTJ
     NOUN = symbols.NOUN
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 28e06a2ecea..a80985f6ff2 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -32,16 +32,34 @@ cdef class StringStore:
             for string in strings:
                 self.add(string)
 
-    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
-        """Retrieve a string from a given hash. If a string
-        is passed as the input, add it to the store and return
-        its hash.
+    def __getitem__(self, object string_or_id):
+        """Retrieve a string from a given hash, or vice versa.
 
-        string_or_hash (int / str): The hash value to lookup or the string to store.
-        RETURNS (str / int): The stored string or the hash of the newly added string.
+        string_or_id (bytes, str or uint64): The value to encode.
+        Returns (str / uint64): The value to be retrieved.
         """
-        if isinstance(string_or_hash, str):
-            return self.add(string_or_hash)
+        cdef hash_t str_hash
+        cdef Utf8Str* utf8str = NULL
+
+        if isinstance(string_or_id, str):
+            if len(string_or_id) == 0:
+                return 0
+
+            # Return early if the string is found in the symbols LUT.
+            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
+            if symbol is not None:
+                return symbol
+            else:
+                return hash_string(string_or_id)
+        elif isinstance(string_or_id, bytes):
+            return hash_utf8(string_or_id, len(string_or_id))
+        elif _try_coerce_to_hash(string_or_id, &str_hash):
+            if str_hash == 0:
+                return ""
+            elif str_hash in SYMBOLS_BY_INT:
+                return SYMBOLS_BY_INT[str_hash]
+            else:
+                utf8str = <Utf8Str*>self._map.get(str_hash)
         else:
             return self._get_interned_str(string_or_hash)
 
@@ -111,24 +129,13 @@ cdef class StringStore:
         if isinstance(string_or_hash, str):
             return string_or_hash
         else:
-            return self._get_interned_str(string_or_hash)
+            # TODO: Raise an error instead
+            return self._map.get(string_or_id) is not NULL
 
-    def items(self) -> List[Tuple[str, int]]:
-        """Iterate over the stored strings and their hashes in insertion order.
-
-        RETURNS: A list of string-hash pairs.
-        """
-        # Even though we internally store the hashes as keys and the strings as
-        # values, we invert the order in the public API to keep it consistent with
-        # the implementation of the `__iter__` method (where we wish to iterate over
-        # the strings in the store).
-        cdef int i
-        pairs = [None] * self._keys.size()
-        for i in range(self._keys.size()):
-            str_hash = self._keys[i]
-            utf8str = <Utf8Str*>self._map.get(str_hash)
-            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
-        return pairs
+        if str_hash in SYMBOLS_BY_INT:
+            return True
+        else:
+            return self._map.get(str_hash) is not NULL
 
     def keys(self) -> List[str]:
         """Iterate over the stored strings in insertion order.
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
index 2c2fcef755e..fb034accac2 100644
--- a/spacy/tests/test_symbols.py
+++ b/spacy/tests/test_symbols.py
@@ -1,5 +1,4 @@
 import pytest
-
 from spacy.symbols import IDS, NAMES
 
 V3_SYMBOLS = {

From 8d6eccf64951aed75cde130544ebe71354b65358 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH 341/504] Store activations in `Doc`s when `save_activations`
 is enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/edit_tree_lemmatizer.py        |  45 ++---
 spacy/pipeline/entity_linker.py               | 167 +++++++++++-------
 spacy/pipeline/morphologizer.pyx              |  30 ++++
 spacy/pipeline/senter.pyx                     |  31 +++-
 spacy/pipeline/spancat.py                     |  37 ++--
 spacy/pipeline/tagger.pyx                     |  34 +++-
 spacy/pipeline/textcat.py                     |   4 +
 spacy/pipeline/textcat_multilabel.py          |  18 +-
 spacy/pipeline/trainable_pipe.pyx             |   6 +-
 .../pipeline/test_edit_tree_lemmatizer.py     |  25 +++
 spacy/tests/pipeline/test_entity_linker.py    |   8 +-
 spacy/tests/pipeline/test_morphologizer.py    |  25 ++-
 spacy/tests/pipeline/test_senter.py           |  25 ++-
 spacy/tests/pipeline/test_tagger.py           |  49 +----
 spacy/tests/pipeline/test_textcat.py          |  48 +++--
 spacy/tokens/doc.pxd                          |   2 +
 spacy/tokens/doc.pyi                          |   3 +-
 website/docs/api/doc.mdx                      |   1 +
 website/docs/api/entitylinker.mdx             |   2 +-
 website/docs/api/morphologizer.mdx            |   6 +-
 20 files changed, 388 insertions(+), 178 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 1a29735e8e8..1f16b44cfed 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
@@ -18,6 +18,10 @@
 from .lemmatizer import lemmatizer_score
 from .trainable_pipe import TrainablePipe
 
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 
@@ -50,6 +54,7 @@
         "top_k": 1,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
         "save_activations": False,
+        "save_activations": False,
     },
     default_score_weights={"lemma_acc": 1.0},
 )
@@ -63,6 +68,7 @@ def make_edit_tree_lemmatizer(
     top_k: int,
     scorer: Optional[Callable],
     save_activations: bool,
+    save_activations: bool,
 ):
     """Construct an EditTreeLemmatizer component."""
     return EditTreeLemmatizer(
@@ -75,6 +81,7 @@ def make_edit_tree_lemmatizer(
         top_k=top_k,
         scorer=scorer,
         save_activations=save_activations,
+        save_activations=save_activations,
     )
 
 
@@ -95,6 +102,7 @@ def __init__(
         top_k: int = 1,
         scorer: Optional[Callable] = lemmatizer_score,
         save_activations: bool = False,
+        save_activations: bool = False,
     ):
         """
         Construct an edit tree lemmatizer.
@@ -107,6 +115,7 @@ def __init__(
         overwrite (bool): overwrite existing lemma annotations.
         top_k (int): try to apply at most the k most probable edit trees.
         save_activations (bool): save model activations in Doc when annotating.
+        save_activations (bool): save model activations in Doc when annotating.
         """
         self.vocab = vocab
         self.model = model
@@ -122,6 +131,7 @@ def __init__(
         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
@@ -150,25 +160,6 @@ def get_loss(
 
         return float(loss), d_scores
 
-    def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
-    ) -> Tuple[float, List[Floats2d]]:
-        """Calculate the loss and its gradient for a batch of student
-        scores, relative to teacher scores.
-
-        teacher_scores: Scores representing the teacher model's predictions.
-        student_scores: Scores representing the student model's predictions.
-
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
-        """
-        loss_func = SequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
-        return float(loss), d_scores
-
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
@@ -180,13 +171,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
             scores: List[Floats2d] = [
                 self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
             ]
+            guesses: List[Ints1d] = [
+                self.model.ops.alloc((0,), dtype="i") for doc in docs
+            ]
+            scores: List[Floats2d] = [
+                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
+            ]
             assert len(guesses) == n_docs
             return {"probabilities": scores, "tree_ids": guesses}
+            return {"probabilities": scores, "tree_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
         guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
         return {"probabilities": scores, "tree_ids": guesses}
+        return {"probabilities": scores, "tree_ids": guesses}
 
     def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
@@ -246,9 +245,15 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
 
         return guesses
 
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
+        batch_tree_ids = activations["tree_ids"]
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         batch_tree_ids = activations["tree_ids"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 629a5f193aa..2716d3821e2 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,10 @@
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
+from typing import cast
+from numpy import dtype
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from pathlib import Path
+from itertools import islice
+import srsly
 import random
 import warnings
 from itertools import islice
@@ -21,10 +28,14 @@
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
+
 ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
 
+# See #9050
+BACKWARD_OVERWRITE = True
+
 default_model_config = """
 [model]
 @architectures = "spacy.EntityLinker.v2"
@@ -61,6 +72,7 @@
         "candidates_batch_size": 1,
         "threshold": None,
         "save_activations": False,
+        "save_activations": False,
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -89,6 +101,7 @@ def make_entity_linker(
     candidates_batch_size: int,
     threshold: Optional[float] = None,
     save_activations: bool,
+    save_activations: bool,
 ):
     """Construct an EntityLinker component.
 
@@ -113,6 +126,7 @@ def make_entity_linker(
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
+    save_activations (bool): save model activations in Doc when annotating.
     """
     if not model.attrs.get("include_span_maker", False):
         raise ValueError(Errors.E4005)
@@ -135,6 +149,7 @@ def make_entity_linker(
         candidates_batch_size=candidates_batch_size,
         threshold=threshold,
         save_activations=save_activations,
+        save_activations=save_activations,
     )
 
 
@@ -176,6 +191,7 @@ def __init__(
         candidates_batch_size: int,
         threshold: Optional[float] = None,
         save_activations: bool = False,
+        save_activations: bool = False,
     ) -> None:
         """Initialize an entity linker.
 
@@ -230,6 +246,7 @@ def __init__(
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
@@ -437,6 +454,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         loss = loss / len(entity_encodings)
         return float(loss), out
 
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
         Returns the KB IDs for each entity in each doc, including NIL if there is
@@ -454,38 +472,48 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         xp = ops.xp
         docs_ents: List[Ragged] = []
         docs_scores: List[Ragged] = []
+        ops = self.model.ops
+        xp = ops.xp
+        docs_ents: List[Ragged] = []
+        docs_scores: List[Ragged] = []
         if not docs:
-            return {
-                KNOWLEDGE_BASE_IDS: final_kb_ids,
-                "ents": docs_ents,
-                "scores": docs_scores,
-            }
+            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
         if isinstance(docs, Doc):
             docs = [docs]
+        for doc in docs:
+            doc_ents: List[Ints1d] = []
+            doc_scores: List[Floats1d] = []
         for doc in docs:
             doc_ents: List[Ints1d] = []
             doc_scores: List[Floats1d] = []
             if len(doc) == 0:
+                docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
+                docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
                 docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 continue
             sentences = [s for s in doc.sents]
 
-            # Loop over entities in batches.
-            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
-                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
-                # Look up candidate entities.
-                valid_ent_idx = [
-                    idx
-                    for idx in range(len(ent_batch))
-                    if ent_batch[idx].label_ not in self.labels_discard
-                ]
-
-                batch_candidates = list(
-                    self.get_candidates_batch(
-                        self.kb,
-                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
+                if self.incl_context:
+                    # get n_neighbour sentences, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    sentence_encoding = self.model.predict([sent_doc])[0]
+                    sentence_encoding_t = sentence_encoding.T
+                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                entity_count += 1
+                if ent.label_ in self.labels_discard:
+                    # ignoring this entity - setting to NIL
+                    final_kb_ids.append(self.NIL)
+                    self._add_activations(
+                        doc_scores=doc_scores,
+                        doc_ents=doc_ents,
+                        scores=[0.0],
+                        ents=[0],
                     )
                 else:
                     candidates = list(self.get_candidates(self.kb, ent))
@@ -519,51 +547,17 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                             entity_encodings = xp.asarray(
                                 [c.entity_vector for c in candidates]
                             )
-                        elif len(candidates) == 1 and self.threshold is None:
-                            # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_id_)
-                            self._add_activations(
-                                doc_scores=doc_scores,
-                                doc_ents=doc_ents,
-                                scores=[1.0],
-                                ents=[candidates[0].entity_id],
-                            )
-                        else:
-                            random.shuffle(candidates)
-                            # set all prior probabilities to 0 if incl_prior=False
-                            if self.incl_prior and self.kb.supports_prior_probs:
-                                prior_probs = xp.asarray([c.prior_prob for c in candidates])  # type: ignore
-                            else:
-                                prior_probs = xp.asarray([0.0 for _ in candidates])
-                            scores = prior_probs
-                            # add in similarity from the context
-                            if self.incl_context:
-                                entity_encodings = xp.asarray(
-                                    [c.entity_vector for c in candidates]
-                                )
-                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
-                                if len(entity_encodings) != len(prior_probs):
-                                    raise RuntimeError(
-                                        Errors.E147.format(
-                                            method="predict",
-                                            msg="vectors not of equal length",
-                                        )
+                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+                            if len(entity_encodings) != len(prior_probs):
+                                raise RuntimeError(
+                                    Errors.E147.format(
+                                        method="predict",
+                                        msg="vectors not of equal length",
                                     )
                                 )
-                                if sims.shape != prior_probs.shape:
-                                    raise ValueError(Errors.E161)
-                                scores = prior_probs + sims - (prior_probs * sims)
-                            final_kb_ids.append(
-                                candidates[scores.argmax().item()].entity_id_
-                                if self.threshold is None
-                                or scores.max() >= self.threshold
-                                else EntityLinker.NIL
-                            )
-                            self._add_activations(
-                                doc_scores=doc_scores,
-                                doc_ents=doc_ents,
-                                scores=scores,
-                                ents=[c.entity_id for c in candidates],
+                            # cosine similarity
+                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+                                sentence_norm * entity_norm
                             )
                             if sims.shape != prior_probs.shape:
                                 raise ValueError(Errors.E161)
@@ -590,27 +584,35 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return {
-            KNOWLEDGE_BASE_IDS: final_kb_ids,
-            "ents": docs_ents,
-            "scores": docs_scores,
-        }
+        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
 
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced
                                  by EntityLinker.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced
+                                 by EntityLinker.predict.
 
         DOCS: https://spacy.io/api/entitylinker#set_annotations
         """
         kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
+        kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
         count_ents = len([ent for doc in docs for ent in doc.ents])
         if count_ents != len(kb_ids):
             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
         i = 0
         overwrite = self.cfg["overwrite"]
+        for j, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    if act_name != KNOWLEDGE_BASE_IDS:
+                        # We only copy activations that are Ragged.
+                        doc.activations[self.name][act_name] = cast(Ragged, acts[j])
+
         for j, doc in enumerate(docs):
             if self.save_activations:
                 doc.activations[self.name] = {}
@@ -746,3 +748,32 @@ def _add_activations(
         ops = self.model.ops
         doc_scores.append(ops.asarray1f(scores))
         doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
+
+    def _add_doc_activations(
+        self,
+        *,
+        docs_scores: List[Ragged],
+        docs_ents: List[Ragged],
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
+        docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
+        docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
+
+    def _add_activations(
+        self,
+        *,
+        doc_scores: List[Floats1d],
+        doc_ents: List[Ints1d],
+        scores: Sequence[float],
+        ents: Sequence[int],
+    ):
+        if not self.save_activations:
+            return
+        ops = self.model.ops
+        doc_scores.append(ops.asarray1f(scores))
+        doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 43e36b36844..c26be7912a9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,4 +1,8 @@
 # cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import srsly
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Iterable, Optional, Union
 
@@ -8,6 +12,12 @@ from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 
+from ..parts_of_speech import IDS as POS_IDS
+from ..symbols import POS
+from ..language import Language
+from ..errors import Errors
+from .pipe import deserialize_config
+from .tagger import ActivationsT, Tagger
 from .. import util
 from ..errors import Errors
 from ..language import Language
@@ -58,6 +68,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
         "save_activations": False,
     },
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -69,9 +86,12 @@ def make_morphologizer(
     label_smoothing: float,
     scorer: Optional[Callable],
     save_activations: bool,
+    save_activations: bool,
 ):
     return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
                          save_activations=save_activations)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+                         save_activations=save_activations)
 
 
 def morphologizer_score(examples, **kwargs):
@@ -107,6 +127,7 @@ class Morphologizer(Tagger):
         extend: bool = False,
         scorer: Optional[Callable] = morphologizer_score,
         save_activations: bool = False,
+        save_activations: bool = False,
     ):
         """Initialize a morphologizer.
 
@@ -120,6 +141,7 @@ class Morphologizer(Tagger):
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
         save_activations (bool): save model activations in Doc when annotating.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/morphologizer#init
         """
@@ -141,6 +163,7 @@ class Morphologizer(Tagger):
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -234,15 +257,18 @@ class Morphologizer(Tagger):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
 
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
 
         DOCS: https://spacy.io/api/morphologizer#set_annotations
         """
         batch_tag_ids = activations["label_ids"]
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
@@ -253,6 +279,10 @@ class Morphologizer(Tagger):
         # to allocate a compatible container out of the iterable.
         labels = tuple(self.labels)
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 51670dcf8cf..dd56b4a62e6 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,12 +1,16 @@
 # cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
@@ -40,6 +44,12 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
         "scorer": {"@scorers": "spacy.senter_scorer.v1"},
         "save_activations": False,
     },
+    default_config={
+        "model": DEFAULT_SENTER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
+        "save_activations": False,
+    },
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_senter(nlp: Language,
@@ -49,6 +59,13 @@ def make_senter(nlp: Language,
                 scorer: Optional[Callable],
                 save_activations: bool):
     return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
+def make_senter(nlp: Language,
+                name: str,
+                model: Model,
+                overwrite: bool,
+                scorer: Optional[Callable],
+                save_activations: bool):
+    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
 
 
 def senter_score(examples, **kwargs):
@@ -79,6 +96,7 @@ class SentenceRecognizer(Tagger):
         overwrite=False,
         scorer=senter_score,
         save_activations: bool = False,
+        save_activations: bool = False,
     ):
         """Initialize a sentence recognizer.
 
@@ -90,6 +108,7 @@ class SentenceRecognizer(Tagger):
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
         save_activations (bool): save model activations in Doc when annotating.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/sentencerecognizer#init
         """
@@ -100,6 +119,7 @@ class SentenceRecognizer(Tagger):
         self.cfg = {"overwrite": overwrite}
         self.scorer = scorer
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -117,20 +137,27 @@ class SentenceRecognizer(Tagger):
     def label_data(self):
         return None
 
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
 
         DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
         """
         batch_tag_ids = activations["label_ids"]
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 72fd78f461e..d800a4d484b 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,18 +1,8 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Protocol,
-    Tuple,
-    Union,
-    cast,
-    runtime_checkable,
-)
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
@@ -36,6 +26,9 @@
 ActivationsT = Dict[str, Union[Floats2d, Ragged]]
 
 
+ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+
+
 spancat_default_config = """
 [model]
 @architectures = "spacy.SpanCategorizer.v1"
@@ -191,6 +184,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
         "save_activations": False,
+        "save_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -204,6 +198,7 @@ def make_spancat(
     threshold: float,
     max_positive: Optional[int],
     save_activations: bool,
+    save_activations: bool,
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component and configure it for multi-label
     classification to be able to assign multiple labels for each span.
@@ -232,6 +227,7 @@ def make_spancat(
     max_positive (Optional[int]): Maximum number of labels to consider positive
         per span. Defaults to None, indicating no limit.
         save_activations (bool): save model activations in Doc when annotating.
+        save_activations (bool): save model activations in Doc when annotating.
     """
     return SpanCategorizer(
         nlp.vocab,
@@ -311,6 +307,7 @@ def make_spancat_singlelabel(
         threshold=None,
         scorer=scorer,
         save_activations=save_activations,
+        save_activations=save_activations,
     )
 
 
@@ -374,6 +371,7 @@ def __init__(
         threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
         save_activations: bool = False,
+        save_activations: bool = False,
     ) -> None:
         """Initialize the multi-label or multi-class span categorizer.
 
@@ -424,6 +422,7 @@ def __init__(
         self.name = name
         self.scorer = scorer
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
     @property
     def key(self) -> str:
@@ -481,6 +480,7 @@ def label_data(self) -> List[str]:
         """
         return list(self.labels)
 
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
@@ -492,6 +492,8 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         indices = self.suggester(docs, ops=self.model.ops)
         scores = self.model.predict((docs, indices))  # type: ignore
         return {"indices": indices, "scores": scores}
+        scores = self.model.predict((docs, indices))  # type: ignore
+        return {"indices": indices, "scores": scores}
 
     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -511,11 +513,13 @@ def set_candidates(
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])
 
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
+        activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
 
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
@@ -524,9 +528,10 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
         indices = activations["indices"]
         assert isinstance(indices, Ragged)
         scores = cast(Floats2d, activations["scores"])
+
         offset = 0
         for i, doc in enumerate(docs):
-            indices_i = cast(Ints2d, indices[i].dataXd)
+            indices_i = indices[i].dataXd
             if self.save_activations:
                 doc.activations[self.name] = {}
                 doc.activations[self.name]["indices"] = indices_i
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 21c7b3ab0a3..95016072ef3 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,9 @@
 # cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Dict, Iterable, List, Optional, Union
+import numpy
+import srsly
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
@@ -22,8 +27,12 @@ from ..util import registry
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
+
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
+# See #9050
+BACKWARD_OVERWRITE = False
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -51,6 +60,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
         "neg_prefix": "!",
         "save_activations": False,
     },
+    default_config={
+        "model": DEFAULT_TAGGER_MODEL,
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
+        "neg_prefix": "!",
+        "save_activations": False,
+    },
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(
@@ -61,6 +77,7 @@ def make_tagger(
     scorer: Optional[Callable],
     neg_prefix: str,
     save_activations: bool,
+    save_activations: bool,
 ):
     """Construct a part-of-speech tagger component.
 
@@ -71,6 +88,8 @@ def make_tagger(
     """
     return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
                   save_activations=save_activations)
+    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
+                  save_activations=save_activations)
 
 
 def tagger_score(examples, **kwargs):
@@ -97,6 +116,7 @@ class Tagger(TrainablePipe):
         scorer=tagger_score,
         neg_prefix="!",
         save_activations: bool = False,
+        save_activations: bool = False,
     ):
         """Initialize a part-of-speech tagger.
 
@@ -108,6 +128,7 @@ class Tagger(TrainablePipe):
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
         save_activations (bool): save model activations in Doc when annotating.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/tagger#init
         """
@@ -119,6 +140,7 @@ class Tagger(TrainablePipe):
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -137,6 +159,7 @@ class Tagger(TrainablePipe):
         """Data about the labels currently added to the component."""
         return tuple(self.cfg["labels"])
 
+    def predict(self, docs) -> ActivationsT:
     def predict(self, docs) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
@@ -151,11 +174,13 @@ class Tagger(TrainablePipe):
             guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
             assert len(guesses) == len(docs)
             return {"probabilities": guesses, "label_ids": guesses}
+            return {"probabilities": guesses, "label_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == len(docs), (len(scores), len(docs))
         guesses = self._scores2guesses(scores)
         assert len(guesses) == len(docs)
         return {"probabilities": scores, "label_ids": guesses}
+        return {"probabilities": scores, "label_ids": guesses}
 
     def _scores2guesses(self, scores):
         guesses = []
@@ -166,21 +191,28 @@ class Tagger(TrainablePipe):
             guesses.append(doc_guesses)
         return guesses
 
+    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
+        activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
 
         DOCS: https://spacy.io/api/tagger#set_annotations
         """
         batch_tag_ids = activations["label_ids"]
+        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         labels = self.labels
         for i, doc in enumerate(docs):
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                for act_name, acts in activations.items():
+                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
@@ -270,7 +302,7 @@ class Tagger(TrainablePipe):
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
         """
         loss_func = SequenceCategoricalCrossentropy(normalize=False)
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 13841dd7bbb..79a98b9bc5f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 309b9a84443..d38beb441da 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
@@ -80,6 +84,8 @@
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
         "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
         "save_activations": False,
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -101,6 +107,9 @@ def make_multilabel_textcat(
     threshold: float,
     scorer: Optional[Callable],
     save_activations: bool,
+) -> "TextCategorizer":
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    save_activations: bool,
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -119,6 +128,12 @@ def make_multilabel_textcat(
         threshold=threshold,
         scorer=scorer,
         save_activations=save_activations,
+        nlp.vocab,
+        model,
+        name,
+        threshold=threshold,
+        scorer=scorer,
+        save_activations=save_activations,
     )
 
 
@@ -151,6 +166,7 @@ def __init__(
         threshold: float,
         scorer: Optional[Callable] = textcat_multilabel_score,
         save_activations: bool = False,
+        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for multi-label classification.
 
@@ -159,7 +175,6 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
-        scorer (Optional[Callable]): The scoring method.
         save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init
@@ -172,6 +187,7 @@ def __init__(
         self.cfg = dict(cfg)
         self.scorer = scorer
         self.save_activations = save_activations
+        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 065a6c20d62..b9c297990f9 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -2,10 +2,14 @@
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
-from thinc.api import Model, Optimizer, set_dropout_rate
+from thinc.api import set_dropout_rate, Model, Optimizer
+import warnings
 
 from ..tokens.doc cimport Doc
 
+from ..training import validate_examples
+from ..errors import Errors, Warnings
+from .pipe import Pipe, deserialize_config
 from .. import util
 from ..errors import Errors
 from ..language import Language
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 7465c844492..e423965bedc 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,3 +1,4 @@
+from typing import cast
 import pickle
 from typing import cast
 
@@ -10,6 +11,7 @@
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
 from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
@@ -403,3 +405,26 @@ def test_save_activations():
     ]
     assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
     assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
+
+
+def test_save_activations():
+    nlp = English()
+    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+    lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = lemmatizer.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "trainable_lemmatizer" not in doc.activations
+
+    lemmatizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+        "probabilities",
+        "tree_ids",
+    ]
+    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e44fef2ad25..804332a9ae8 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,8 +1,9 @@
-from typing import Any, Callable, Dict, Iterable, cast
+from typing import Callable, Iterable, Dict, Any, cast
 
 import pytest
 from numpy.testing import assert_equal
 from thinc.types import Ragged
+from thinc.types import Ragged
 
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
@@ -10,8 +11,8 @@
 from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker, get_candidates
 from spacy.pipeline import EntityLinker, TrainablePipe
+from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tests.util import make_tempdir
@@ -1292,6 +1293,7 @@ def create_kb(vocab):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
 
 
+def test_save_activations():
 def test_save_activations():
     nlp = English()
     vector_length = 3
@@ -1307,7 +1309,7 @@ def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 542d14d1516..bf2eea8a94e 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,5 +1,4 @@
 from typing import cast
-
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -10,7 +9,7 @@
 from spacy.language import Language
 from spacy.morphology import Morphology
 from spacy.pipeline import TrainablePipe
-from spacy.tests.util import make_tempdir
+from spacy.attrs import MORPH
 from spacy.tokens import Doc
 from spacy.training import Example
 
@@ -255,3 +254,25 @@ def test_save_activations():
     }
     assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
     assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
+
+
+def test_save_activations():
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert "morphologizer" not in doc.activations
+
+    morphologizer.save_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {
+        "label_ids",
+        "probabilities",
+    }
+    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 51f943898f1..a594c10b04c 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,5 +1,4 @@
 from typing import cast
-
 import pytest
 from numpy.testing import assert_equal
 
@@ -8,6 +7,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir
 from spacy.training import Example
 
@@ -133,3 +133,26 @@ def test_save_activations():
     assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
     assert doc.activations["senter"]["probabilities"].shape == (5, nO)
     assert doc.activations["senter"]["label_ids"].shape == (5,)
+
+
+def test_save_activations():
+    # Test if activations are correctly added to Doc when requested.
+    nlp = English()
+    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = senter.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "senter" not in doc.activations
+
+    senter.save_activations = True
+    doc = nlp("This is a test.")
+    assert "senter" in doc.activations
+    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+    assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 05e814f0733..50cc828a038 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,5 +1,4 @@
 from typing import cast
-
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -9,7 +8,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from spacy.training import Example
+from thinc.api import compounding
 
 from ..util import make_tempdir
 
@@ -240,52 +239,6 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
-def test_is_distillable():
-    nlp = English()
-    tagger = nlp.add_pipe("tagger")
-    assert tagger.is_distillable
-
-
-def test_distill():
-    teacher = English()
-    teacher_tagger = teacher.add_pipe("tagger")
-    train_examples = []
-    for t in TRAIN_DATA:
-        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
-
-    optimizer = teacher.initialize(get_examples=lambda: train_examples)
-
-    for i in range(50):
-        losses = {}
-        teacher.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["tagger"] < 0.00001
-
-    student = English()
-    student_tagger = student.add_pipe("tagger")
-    student_tagger.min_tree_freq = 1
-    student_tagger.initialize(
-        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
-    )
-
-    distill_examples = [
-        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
-    ]
-
-    for i in range(50):
-        losses = {}
-        student_tagger.distill(
-            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
-        )
-    assert losses["tagger"] < 0.00001
-
-    test_text = "I like blue eggs"
-    doc = student(test_text)
-    assert doc[0].tag_ == "N"
-    assert doc[1].tag_ == "V"
-    assert doc[2].tag_ == "J"
-    assert doc[3].tag_ == "N"
-
-
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2bba40d1d13..a54bf394608 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,3 +1,4 @@
+from typing import cast
 import random
 from typing import cast
 
@@ -13,16 +14,12 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import (
-    single_label_bow_config,
-    single_label_cnn_config,
-    single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
-    multi_label_bow_config,
-    multi_label_cnn_config,
-    multi_label_default_config,
-)
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
@@ -304,6 +301,7 @@ def test_issue9904():
 
     examples = get_examples()
     scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
+    scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
 
     loss = textcat.get_loss(examples, scores)[0]
     loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -962,9 +960,11 @@ def test_textcat_multi_threshold():
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 
 
+def test_save_activations():
 def test_save_activations():
     nlp = English()
     textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -981,6 +981,34 @@ def test_save_activations():
     assert doc.activations["textcat"]["probabilities"].shape == (nO,)
 
 
+def test_save_activations_multi():
+    nlp = English()
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_MULTI_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "textcat_multilabel" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
+    assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
+    nO = textcat.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "textcat" not in doc.activations
+
+    textcat.save_activations = True
+    doc = nlp("This is a test.")
+    assert list(doc.activations["textcat"].keys()) == ["probabilities"]
+    assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+
+
 def test_save_activations_multi():
     nlp = English()
     textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 9fb6a72c87f..fc0404f1423 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
 
     cdef public dict activations
 
+    cdef public dict activations
+
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 1304a8aae8d..d83aa0e5486 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -15,7 +15,7 @@ from typing import (
 
 import numpy as np
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
 from .span import Span
 from .token import Token
 from .span_groups import SpanGroups
@@ -37,6 +37,7 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
+    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index e92c0e833e0..842b2181a81 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -762,6 +762,7 @@ The L2 norm of the document's vector representation.
 | `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
 | `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
 | `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
+| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
 | `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
 | `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
 | `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 12b2f6bef1d..238b62a2e6d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
 | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
 | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
 | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
 | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 61abe043e77..4660ec312fa 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |
@@ -454,8 +454,8 @@ coarse-grained POS as the feature `POS`.
 > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
 > ```
 
-| Name        | Description                                            |
-| ----------- | ------------------------------------------------------ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
 | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}

From 2ae6f98351f0fb3636e63a6d40757580936afc3a Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 15 Sep 2022 17:06:58 +0200
Subject: [PATCH 342/504] disable mypy run for Python 3.10 (#11508) (#11512)

---
 .github/azure-steps.yml | 44 ++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 11dc7e295e4..c7722391fec 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -64,22 +64,12 @@ steps:
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)
 
-#  - script: |
-#      python -m spacy download ca_core_news_sm
-#      python -m spacy download ca_core_news_md
-#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-#    displayName: 'Test download CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-#    displayName: 'Test no warnings on load (#11713)'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
-#    displayName: 'Test skip re-download (#12188)'
-#    condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -103,17 +93,17 @@ steps:
     displayName: 'Test train CLI'
     condition: eq(variables['python_version'], '3.8')
 
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-#    displayName: 'Test assemble CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-#    displayName: 'Test assemble CLI vectors warning'
-#    condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json

From 2f720d1ab9f77c89d435ca44b150608d2fafe999 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Oct 2022 14:41:15 +0200
Subject: [PATCH 343/504] fix test for EL activations with refactored KB

---
 spacy/tests/pipeline/test_entity_linker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 804332a9ae8..d2d07929a70 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1309,7 +1309,7 @@ def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(

From 014530a0d118f260d661483aaa76a487a083a18e Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:51:06 +0200
Subject: [PATCH 344/504] `StringStore` refactoring (#11344)

* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb6b08cfdcfbf26df6896f26709fac0d.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
---
 spacy/errors.py                  |  4 ++
 spacy/matcher/matcher.pyx        |  4 ++
 spacy/strings.pxd                |  3 +
 spacy/strings.pyi                | 15 ++++-
 spacy/strings.pyx                | 98 +++++++++++++++++++-------------
 spacy/tokens/graph.pyx           | 28 ++++-----
 spacy/tokens/retokenizer.pyx     |  3 -
 website/docs/api/stringstore.mdx | 22 ++++++-
 8 files changed, 118 insertions(+), 59 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 6bb7d992954..de47325332a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -256,6 +256,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
+    E016 = ("MultitaskObjective target should be function or one of: dep, "
+            "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")
@@ -982,6 +984,8 @@ class Errors(metaclass=ErrorsWithCodes):
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
+    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
+             "but got '{received_type}'")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 8accd8c4465..73d60767b2f 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -32,6 +32,10 @@ from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
 from ..vocab cimport Vocab
 
+from ..schemas import validate_token_pattern
+from ..errors import Errors, MatchPatternError, Warnings
+from ..strings cimport get_string_id
+from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
 from .levenshtein import levenshtein_compare
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index c05731c9a15..688dbc46261 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,3 +1,6 @@
+from libc.stdint cimport int64_t, uint32_t
+from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 98224fcd449..1da5443fb11 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,20 +1,29 @@
+from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
-from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union, overload
+from typing import Any, Iterable, Iterator, Optional, Union, overload
 
 class StringStore:
-    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
     @overload
     def __getitem__(self, string_or_hash: str) -> int: ...
+    def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
     def __getitem__(self, string_or_hash: int) -> str: ...
     def as_int(self, string_or_hash: Union[str, int]) -> int: ...
     def as_string(self, string_or_hash: Union[str, int]) -> str: ...
+    def __getitem__(self, string_or_hash: int) -> str: ...
+    def as_int(self, string_or_hash: Union[str, int]) -> int: ...
+    def as_string(self, string_or_hash: Union[str, int]) -> str: ...
     def add(self, string: str) -> int: ...
     def items(self) -> List[Tuple[str, int]]: ...
     def keys(self) -> List[str]: ...
     def values(self) -> List[int]: ...
+    def items(self) -> List[Tuple[str, int]]: ...
+    def keys(self) -> List[str]: ...
+    def values(self) -> List[int]: ...
     def __len__(self) -> int: ...
     def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
+    def __contains__(self, string_or_hash: Union[str, int]) -> bool: ...
     def __iter__(self) -> Iterator[str]: ...
     def __reduce__(self) -> Any: ...
     def to_disk(self, path: Union[str, Path]) -> None: ...
@@ -24,3 +33,5 @@ class StringStore:
     def _reset_and_load(self, strings: Iterable[str]) -> None: ...
 
 def get_string_id(string_or_hash: Union[str, int]) -> int: ...
+
+def get_string_id(string_or_hash: Union[str, int]) -> int: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a80985f6ff2..a20e07a9482 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,9 +1,7 @@
 # cython: infer_types=True
-from typing import Iterable, Iterator, List, Optional, Tuple, Union
-
+from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
+cimport cython
 from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 import srsly
@@ -16,11 +14,14 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
+
 cdef class StringStore:
     """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
+    """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
 
     DOCS: https://spacy.io/api/stringstore
     """
+    def __init__(self, strings: Optional[Iterable[str]] = None):
     def __init__(self, strings: Optional[Iterable[str]] = None):
         """Create the StringStore.
 
@@ -32,34 +33,16 @@ cdef class StringStore:
             for string in strings:
                 self.add(string)
 
-    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given hash, or vice versa.
+    def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]:
+        """Retrieve a string from a given hash. If a string
+        is passed as the input, add it to the store and return
+        its hash.
 
-        string_or_id (bytes, str or uint64): The value to encode.
-        Returns (str / uint64): The value to be retrieved.
+        string_or_hash (int / str): The hash value to lookup or the string to store.
+        RETURNS (str / int): The stored string or the hash of the newly added string.
         """
-        cdef hash_t str_hash
-        cdef Utf8Str* utf8str = NULL
-
-        if isinstance(string_or_id, str):
-            if len(string_or_id) == 0:
-                return 0
-
-            # Return early if the string is found in the symbols LUT.
-            symbol = SYMBOLS_BY_STR.get(string_or_id, None)
-            if symbol is not None:
-                return symbol
-            else:
-                return hash_string(string_or_id)
-        elif isinstance(string_or_id, bytes):
-            return hash_utf8(string_or_id, len(string_or_id))
-        elif _try_coerce_to_hash(string_or_id, &str_hash):
-            if str_hash == 0:
-                return ""
-            elif str_hash in SYMBOLS_BY_INT:
-                return SYMBOLS_BY_INT[str_hash]
-            else:
-                utf8str = <Utf8Str*>self._map.get(str_hash)
+        if isinstance(string_or_hash, str):
+            return self.add(string_or_hash)
         else:
             return self._get_interned_str(string_or_hash)
 
@@ -129,13 +112,24 @@ cdef class StringStore:
         if isinstance(string_or_hash, str):
             return string_or_hash
         else:
-            # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
+            return self._get_interned_str(string_or_hash)
 
-        if str_hash in SYMBOLS_BY_INT:
-            return True
-        else:
-            return self._map.get(str_hash) is not NULL
+    def items(self) -> List[Tuple[str, int]]:
+        """Iterate over the stored strings and their hashes in insertion order.
+
+        RETURNS: A list of string-hash pairs.
+        """
+        # Even though we internally store the hashes as keys and the strings as
+        # values, we invert the order in the public API to keep it consistent with
+        # the implementation of the `__iter__` method (where we wish to iterate over
+        # the strings in the store).
+        cdef int i
+        pairs = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            str_hash = self._keys[i]
+            utf8str = <Utf8Str*>self._map.get(str_hash)
+            pairs[i] = (self._decode_str_repr(utf8str), str_hash)
+        return pairs
 
     def keys(self) -> List[str]:
         """Iterate over the stored strings in insertion order.
@@ -210,6 +204,7 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self._keys.clear()
+        self._keys.clear()
         for string in strings:
             self.add(string)
 
@@ -232,15 +227,40 @@ cdef class StringStore:
         else:
             return self._decode_str_repr(utf8str)
 
+    cdef hash_t _intern_str(self, str string):
+    def _get_interned_str(self, hash_value: int) -> str:
+        cdef hash_t str_hash
+        if not _try_coerce_to_hash(hash_value, &str_hash):
+            raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value)))
+
+        # Handle reserved symbols and empty strings correctly.
+        if str_hash == 0:
+            return ""
+
+        symbol = SYMBOLS_BY_INT.get(str_hash)
+        if symbol is not None:
+            return symbol
+
+        utf8str = <Utf8Str*>self._map.get(str_hash)
+        if utf8str is NULL:
+            raise KeyError(Errors.E018.format(hash_value=str_hash))
+        else:
+            return self._decode_str_repr(utf8str)
+
     cdef hash_t _intern_str(self, str string):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         chars = string.encode('utf-8')
         cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
+        chars = string.encode('utf-8')
+        cdef hash_t key = hash64(<unsigned char*>chars, len(chars), 1)
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return key
 
+        value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
+            return key
+
         value = self._allocate_str_repr(<unsigned char*>chars, len(chars))
         self._map.set(key, value)
         self._keys.push_back(key)
@@ -250,6 +270,7 @@ cdef class StringStore:
         cdef int n_length_bytes
         cdef int i
         cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
+        cdef uint32_t ulength = length
         if length < sizeof(string.s):
             string.s[0] = <unsigned char>length
             memcpy(&string.s[1], chars, length)
@@ -307,7 +328,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
 
     try:
         return hash_string(string_or_hash)
-    except:   # no-cython-lint
+    except:
         if _try_coerce_to_hash(string_or_hash, &str_hash):
             # Coerce the integral key to the expected primitive hash type.
             # This ensures that custom/overloaded "primitive" data types
@@ -324,5 +345,6 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
         return True
-    except:  # no-cython-lint
+    except:
         return False
+
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 7ded04500a3..fea061c143d 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -18,6 +18,8 @@ from .. import Errors
 from ..strings cimport get_string_id
 from ..structs cimport EdgeC, GraphC
 from ..typedefs cimport hash_t
+from ..strings cimport get_string_id
+from ..structs cimport EdgeC, GraphC
 
 from .token import Token
 
@@ -131,9 +133,9 @@ cdef class Node:
 
         For instance, `node.head(i=1)` will get the head of the second edge that
         this node is a tail of. `node.head(i=1, label="ARG0")` will further
-        check that the second edge has the label `"ARG0"`. 
+        check that the second edge has the label `"ARG0"`.
 
-        If no matching node can be found, the graph's NoneNode is returned. 
+        If no matching node can be found, the graph's NoneNode is returned.
         """
         return self.headed(i=i, label=label)
 
@@ -141,14 +143,14 @@ cdef class Node:
         """Get the tail of the first matching edge, searching by index, label,
         both or neither.
 
-        If no matching node can be found, the graph's NoneNode is returned. 
+        If no matching node can be found, the graph's NoneNode is returned.
         """
         return self.tailed(i=i, label=label).tail
 
     def sibling(self, i=None, label=None):
         """Get the first matching sibling node. Two nodes are siblings if they
         are both tails of the same head.
-        If no matching node can be found, the graph's NoneNode is returned. 
+        If no matching node can be found, the graph's NoneNode is returned.
         """
         if i is None:
             siblings = self.siblings(label=label)
@@ -337,10 +339,10 @@ cdef class NoneNode(Node):
         return self
 
     def walk_heads(self):
-        yield from [] 
+        yield from []
 
     def walk_tails(self):
-        yield from [] 
+        yield from []
 
 
 cdef class Graph:
@@ -387,7 +389,7 @@ cdef class Graph:
             be labelled with the empty string (""). If `labels` is not `None`,
             it must have the same length as the `edges` argument.
         weights (Optional[List[float]]): A list of weights for the provided edges.
-            If None, all of the edges specified by the edges argument will 
+            If None, all of the edges specified by the edges argument will
             have the weight 0.0. If `weights` is not `None`, it must have the
             same length as the `edges` argument.
         """
@@ -486,7 +488,7 @@ cdef class Graph:
         """
         if isinstance(indices, Node):
             return indices
-        cdef vector[int32_t] node 
+        cdef vector[int32_t] node
         node.reserve(len(indices))
         for idx in indices:
             node.push_back(idx)
@@ -499,7 +501,7 @@ cdef class Graph:
         """
         if isinstance(indices, Node):
             return indices
-        cdef vector[int32_t] node 
+        cdef vector[int32_t] node
         node.reserve(len(indices))
         for idx in indices:
             node.push_back(idx)
@@ -588,7 +590,7 @@ cdef int get_head_nodes(vector[int]& output, const GraphC* graph, int node) nogi
     if todo == 0:
         return 0
     output.reserve(output.size() + todo)
-    start = graph.first_head[node] 
+    start = graph.first_head[node]
     end = graph.edges.size()
     for i in range(start, end):
         if todo <= 0:
@@ -604,7 +606,7 @@ cdef int get_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogi
     if todo == 0:
         return 0
     output.reserve(output.size() + todo)
-    start = graph.first_tail[node] 
+    start = graph.first_tail[node]
     end = graph.edges.size()
     for i in range(start, end):
         if todo <= 0:
@@ -633,7 +635,7 @@ cdef int get_head_edges(vector[int]& output, const GraphC* graph, int node) nogi
     if todo == 0:
         return 0
     output.reserve(output.size() + todo)
-    start = graph.first_head[node] 
+    start = graph.first_head[node]
     end = graph.edges.size()
     for i in range(start, end):
         if todo <= 0:
@@ -649,7 +651,7 @@ cdef int get_tail_edges(vector[int]& output, const GraphC* graph, int node) nogi
     if todo == 0:
         return 0
     output.reserve(output.size() + todo)
-    start = graph.first_tail[node] 
+    start = graph.first_tail[node]
     end = graph.edges.size()
     for i in range(start, end):
         if todo <= 0:
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index 7b6501d4442..68631e7547f 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -16,9 +16,6 @@ from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
-from ..util import SimpleFrozenDict
-from .underscore import is_writable_attr
-
 from ..strings cimport get_string_id
 
 
diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 1b1f3bd5352..9b63d586b87 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -47,6 +47,8 @@ Get the number of strings in the store.
 
 ## StringStore.\_\_getitem\_\_ {id="getitem",tag="method"}
 
+Retrieve a string from a given hash. If a string is passed as the input, add it
+to the store and return its hash.
 Retrieve a string from a given hash. If a string is passed as the input, add it
 to the store and return its hash.
 
@@ -59,6 +61,10 @@ to the store and return its hash.
 > assert stringstore[apple_hash] == "apple"
 > ```
 
+| Name             | Description                                                                  |
+| ---------------- | ---------------------------------------------------------------------------- |
+| `string_or_hash` | The hash value to lookup or the string to store. ~~Union[str, int]~~         |
+| **RETURNS**      | The stored string or the hash of the newly added string. ~~Union[str, int]~~ |
 | Name             | Description                                                                  |
 | ---------------- | ---------------------------------------------------------------------------- |
 | `string_or_hash` | The hash value to lookup or the string to store. ~~Union[str, int]~~         |
@@ -66,6 +72,7 @@ to the store and return its hash.
 
 ## StringStore.\_\_contains\_\_ {id="contains",tag="method"}
 
+Check whether a string or a hash is in the store.
 Check whether a string or a hash is in the store.
 
 > #### Example
@@ -76,6 +83,10 @@ Check whether a string or a hash is in the store.
 > assert not "cherry" in stringstore
 > ```
 
+| Name             | Description                                             |
+| ---------------- | ------------------------------------------------------- |
+| `string_or_hash` | The string or hash to check. ~~Union[str, int]~~        |
+| **RETURNS**      | Whether the store contains the string or hash. ~~bool~~ |
 | Name             | Description                                             |
 | ---------------- | ------------------------------------------------------- |
 | `string_or_hash` | The string or hash to check. ~~Union[str, int]~~        |
@@ -83,6 +94,7 @@ Check whether a string or a hash is in the store.
 
 ## StringStore.\_\_iter\_\_ {id="iter",tag="method"}
 
+Iterate over the stored strings in insertion order.
 Iterate over the stored strings in insertion order.
 
 > #### Example
@@ -93,11 +105,14 @@ Iterate over the stored strings in insertion order.
 > assert all_strings == ["apple", "orange"]
 > ```
 
+| Name        | Description                    |
+| ----------- | ------------------------------ |
+| **RETURNS** | A string in the store. ~~str~~ |
 | Name        | Description                    |
 | ----------- | ------------------------------ |
 | **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.items {id="items", tag="method", version="4"}
+## StringStore.items {#iter tag="method" new="4"}
 
 Iterate over the stored string-hash pairs in insertion order.
 
@@ -113,7 +128,7 @@ Iterate over the stored string-hash pairs in insertion order.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
 
-## StringStore.keys {id="keys", tag="method", version="4"}
+## StringStore.keys {#iter tag="method" new="4"}
 
 Iterate over the stored strings in insertion order.
 
@@ -129,7 +144,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | -------------------------------- |
 | **RETURNS** | A list of strings. ~~List[str]~~ |
 
-## StringStore.values {id="values", tag="method", version="4"}
+## StringStore.values {#iter tag="method" new="4"}
 
 Iterate over the stored string hashes in insertion order.
 
@@ -165,6 +180,7 @@ Add a string to the `StringStore`.
 | `string`    | The string to add. ~~str~~       |
 | **RETURNS** | The string's hash value. ~~int~~ |
 
+## StringStore.to_disk {#to_disk tag="method"}
 ## StringStore.to_disk {#to_disk tag="method"}
 
 Save the current state to a directory.

From 7a17461462d0e4c2ba22bc30dcc48ce0c5d0be95 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 21 Oct 2022 18:01:18 +0900
Subject: [PATCH 345/504] Remove thinc util reimports (#11665)

* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
---
 spacy/tests/pipeline/test_spancat.py | 7 +++----
 spacy/util.py                        | 3 +++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 42eb90a1bb1..5dcc2e70f67 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,6 @@
 import numpy
-import pytest
-from numpy.testing import assert_almost_equal, assert_array_equal
-from thinc.api import NumpyOps, Ragged, fix_random_seed, get_current_ops
+from numpy.testing import assert_array_equal, assert_almost_equal
+from thinc.api import get_current_ops, Ragged, fix_random_seed
 
 from spacy import util
 from spacy.lang.en import English
@@ -9,7 +8,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import make_tempdir, registry
+from spacy.util import registry, make_tempdir
 
 OPS = get_current_ops()
 
diff --git a/spacy/util.py b/spacy/util.py
index ae9837e3afe..fdc02a717cc 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -59,6 +59,9 @@
     cupy = None
 
 
+from .symbols import ORTH
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From beade22e90848f37754736bb4eb450e4f38738dc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 346/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/pipeline/__init__.py                    |   1 -
 spacy/pipeline/span_ruler.py                  |   8 +
 spacy/tests/pipeline/test_entity_ruler.py     |   6 +
 .../serialize/test_serialize_pipeline.py      |  17 +-
 website/docs/api/entityruler.mdx              | 298 +++---------------
 website/docs/usage/rule-based-matching.mdx    |   8 +-
 website/docs/usage/saving-loading.mdx         |   6 +-
 7 files changed, 67 insertions(+), 277 deletions(-)

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index af5bb05a0f7..4d1f0d90663 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -4,7 +4,6 @@
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
-from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index cd8fea36b47..3f876598013 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,6 +17,14 @@
 
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
+from ..language import Language
+from ..errors import Errors, Warnings
+from ..util import ensure_path, SimpleFrozenList, registry
+from ..tokens import Doc, Span
+from ..scorer import Scorer, get_ner_prf
+from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 520012c5075..31feb73edde 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,6 +2,12 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, merge_entities
+from spacy.pipeline import SpanRuler
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index d5f2f13af4f..91f6098255e 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,14 +8,9 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import (
-    DependencyParser,
-    EntityRecognizer,
-    SentenceRecognizer,
-    Tagger,
-    TextCategorizer,
-    TrainablePipe,
-)
+from spacy.pipeline import DependencyParser, EntityRecognizer
+from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
+from spacy.pipeline import TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
@@ -92,12 +87,17 @@ def test_issue_3526_1(en_vocab):
     nlp = Language(vocab=en_vocab)
     ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     ruler_bytes = ruler.to_bytes()
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     new_ruler = nlp.add_pipe(
         "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
     )
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
+    )
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(ruler)
     assert len(new_ruler.labels) == 4
@@ -121,6 +121,7 @@ def test_issue_3526_4(en_vocab):
 
 @pytest.mark.issue(4042)
 def test_issue4042():
+    """Test that serialization of an entity_ruler before NER works fine."""
     """Test that serialization of an entity_ruler before NER works fine."""
     nlp = English()
     # add ner pipe
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 293162572c6..7976e7725e0 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,7 +1,5 @@
 ---
 title: EntityRuler
-tag: class
-source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
@@ -77,273 +75,51 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-```python
-%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
-```
-
-## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
+## Migrating from v3 {#migrating}
 
-Initialize the entity ruler. If patterns are supplied here, they need to be a
-list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
-be a token pattern (list) or a phrase pattern (string). For example:
-`{"label": "ORG", "pattern": "Apple"}`.
+### Loading patterns
 
-> #### Example
->
-> ```python
-> # Construction via add_pipe
-> ruler = nlp.add_pipe("entity_ruler")
->
-> # Construction from class
-> from spacy.pipeline import EntityRuler
-> ruler = EntityRuler(nlp, overwrite_ents=True)
-> ```
+Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
+initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
+path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
+JSONL file separately and then added through
+[`SpanRuler.initialize`](/api/spanruler#initialize]) or
+[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
 
-| Name                                                 | Description                                                                                                                                                                                                                           |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
-| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
-| _keyword-only_                                       |                                                                                                                                                                                                                                       |
-| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
-| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
-| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
-| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
-| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
-| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
-
-## EntityRuler.initialize {id="initialize",tag="method",version="3"}
-
-Initialize the component with data and used before training to load in rules
-from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
-method is typically called by [`Language.initialize`](/api/language#initialize)
-and lets you customize arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-> #### Example
->
-> ```python
-> entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.entity_ruler]
->
-> [initialize.components.entity_ruler.patterns]
-> @readers = "srsly.read_jsonl.v1"
-> path = "corpus/entity_ruler_patterns.jsonl
-> ```
-
-| Name           | Description                                                                                                                                                          |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                                                      |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
-
-## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
-
-The number of all patterns added to the entity ruler.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> assert len(ruler) == 0
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert len(ruler) == 1
-> ```
-
-| Name        | Description                     |
-| ----------- | ------------------------------- |
-| **RETURNS** | The number of patterns. ~~int~~ |
-
-## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
-
-Whether a label is present in the patterns.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert "ORG" in ruler
-> assert not "PERSON" in ruler
-> ```
-
-| Name        | Description                                           |
-| ----------- | ----------------------------------------------------- |
-| `label`     | The label to check. ~~str~~                           |
-| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
-
-## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
-
-Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
-happens automatically after the component has been added to the pipeline using
-[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
-with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occuring first in the Doc
-is chosen.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
->
-> doc = nlp("A text about Apple.")
-> ents = [(ent.text, ent.label_) for ent in doc.ents]
-> assert ents == [("Apple", "ORG")]
-> ```
-
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
-| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
-
-## EntityRuler.add_patterns {id="add_patterns",tag="method"}
-
-Add patterns to the entity ruler. A pattern can either be a token pattern (list
-of dicts) or a phrase pattern (string). For more details, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
-
-> #### Example
->
-> ```python
-> patterns = [
->     {"label": "ORG", "pattern": "Apple"},
->     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
-> ]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ```
-
-| Name       | Description                                                      |
-| ---------- | ---------------------------------------------------------------- |
-| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
-
-## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
-
-Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
-the ID does not exist.
-
-> #### Example
->
-> ```python
-> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ruler.remove("apple")
-> ```
-
-| Name | Description                         |
-| ---- | ----------------------------------- |
-| `id` | The ID of the pattern rule. ~~str~~ |
-
-## EntityRuler.to_disk {id="to_disk",tag="method"}
-
-Save the entity ruler patterns to a directory. The patterns will be saved as
-newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
-only the patterns are saved as JSONL. If a directory name is provided, a
-`patterns.jsonl` and `cfg` file with the component configuration is exported.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
-> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
-> ```
-
-| Name   | Description                                                                                                                                              |
-| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-
-## EntityRuler.from_disk {id="from_disk",tag="method"}
-
-Load the entity ruler from a path. Expects either a file containing
-newline-delimited JSON (JSONL) with one entry per line, or a directory
-containing a `patterns.jsonl` file and a `cfg` file with the component
-configuration.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
-> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
-> ```
-
-| Name        | Description                                                                                                   |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
-
-## EntityRuler.to_bytes {id="to_bytes",tag="method"}
-
-Serialize the entity ruler patterns to a bytestring.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler_bytes = ruler.to_bytes()
-> ```
-
-| Name        | Description                        |
-| ----------- | ---------------------------------- |
-| **RETURNS** | The serialized patterns. ~~bytes~~ |
-
-## EntityRuler.from_bytes {id="from_bytes",tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> ruler_bytes = ruler.to_bytes()
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_bytes(ruler_bytes)
-> ```
-
-| Name         | Description                                        |
-| ------------ | -------------------------------------------------- |
-| `bytes_data` | The bytestring to load. ~~bytes~~                  |
-| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
-
-## EntityRuler.labels {id="labels",tag="property"}
-
-All labels present in the match patterns.
-
-| Name        | Description                            |
-| ----------- | -------------------------------------- |
-| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.from_disk("patterns.jsonl")
++ import srsly
++ patterns = srsly.read_jsonl("patterns.jsonl")
++ ruler.add_patterns(patterns)
+```
 
-## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+### Saving patterns
 
-All entity IDs present in the `id` properties of the match patterns.
+`SpanRuler.to_disk` always saves the full component data to a directory and does
+not include an option to save the patterns to a single JSONL file.
 
-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.to_disk("patterns.jsonl")
++ import srsly
++ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
+```
 
-## EntityRuler.patterns {id="patterns",tag="property"}
+### Accessing token and phrase patterns
 
-Get all patterns that were added to the entity ruler.
+The separate token patterns and phrase patterns are no longer accessible under
+`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
+patterns in their original format using the property
+[`SpanRuler.patterns`](/api/spanruler#patterns).
 
-| Name        | Description                                                                              |
-| ----------- | ---------------------------------------------------------------------------------------- |
-| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+### Removing patterns by ID
 
-## Attributes {id="attributes"}
+[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
+remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
 
-| Name              | Description                                                                                                           |
-| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
-| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
-| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.remove("id")
++ ruler.remove_by_id("id")
+```
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 2e5545f0df2..765b786996c 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1410,10 +1410,10 @@ print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given
-in the patterns. So in the example above it's easy to identify that "San
-Francisco" and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given in
+the patterns. So in the example above it's easy to identify that "San Francisco"
+and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index a491b182c6e..97ae3c5e573 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,9 +187,9 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the new
-> [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the
+> [`SpanRuler`](/api/spanruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with rule-based components _with_ all the component

From 548611613714bea23e8800ceae997e13c38acd4b Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Wed, 23 Nov 2022 13:09:32 +0100
Subject: [PATCH 347/504] Remove sentiment extension (#11722)

* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
---
 spacy/tokens/doc.pxd                        | 2 --
 spacy/tokens/doc.pyi                        | 1 -
 website/docs/api/doc.mdx                    | 1 -
 website/docs/api/span.mdx                   | 1 -
 website/docs/api/token.mdx                  | 1 -
 website/docs/usage/processing-pipelines.mdx | 4 ++--
 6 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index fc0404f1423..9fb6a72c87f 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,8 +50,6 @@ cdef class Doc:
 
     cdef public dict activations
 
-    cdef public dict activations
-
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index d83aa0e5486..e62854f77ab 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -37,7 +37,6 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
-    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 842b2181a81..e92c0e833e0 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -762,7 +762,6 @@ The L2 norm of the document's vector representation.
 | `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
 | `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
 | `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
-| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
 | `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
 | `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
 | `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index b1a9bea200e..cd70d8dcead 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -567,5 +567,4 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 12b99394350..16d421c12f4 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index 9dbdadd0ebc..d830f56b58e 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1389,8 +1389,8 @@ Writing to a `._` attribute instead of to the `Doc` directly keeps a clearer
 separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
-immediately know what's built-in and what's custom – for example, `doc.lang` is
-spaCy, while `doc._.language` isn't.
+immediately know what's built-in and what's custom – for example,
+`doc.lang` is spaCy, while `doc._.language` isn't.
 
 </Accordion>
 

From cad4a812d2a53a34b42ff4d1bbcf871fe1bc050c Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 5 Dec 2022 08:57:24 +0100
Subject: [PATCH 348/504] prettier formatting

---
 website/docs/api/cli.mdx                    | 27 +++++++++++----------
 website/docs/usage/processing-pipelines.mdx |  4 +--
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index cfa99a2b350..4a11efbaa0f 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1341,20 +1341,21 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                     | Description                                                                                                                                                                                            |
-| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
-| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                                  |
-| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                                           |
-| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                                          |
-| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                                              |
-| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                                     |
-| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
-| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
-| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                                       |
-| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| Name                    | Description                                                                                                                                                                          |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
+| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
+| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
+| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
+| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
+| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
+| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
+| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
 
+## assemble {#assemble tag="command"}
 ## assemble {#assemble tag="command"}
 
 Assemble a pipeline from a config file without additional training. Expects a
diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx
index d830f56b58e..9dbdadd0ebc 100644
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@@ -1389,8 +1389,8 @@ Writing to a `._` attribute instead of to the `Doc` directly keeps a clearer
 separation and makes it easier to ensure backwards compatibility. For example,
 if you've implemented your own `.coref` property and spaCy claims it one day,
 it'll break your code. Similarly, just by looking at the code, you'll
-immediately know what's built-in and what's custom – for example,
-`doc.lang` is spaCy, while `doc._.language` isn't.
+immediately know what's built-in and what's custom – for example, `doc.lang` is
+spaCy, while `doc._.language` isn't.
 
 </Accordion>
 

From d215bb2c346c8446891fe0898ebf25827950efa9 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 5 Dec 2022 17:43:23 +0900
Subject: [PATCH 349/504] Switch ubuntu-latest to ubuntu-20.04 in main tests
 (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6
---
 azure-pipelines.yml | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 99f1b8afffe..0f7ea91f96f 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: "3.8"
+          versionSpec: "3.7"
       - script: |
           pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
@@ -40,6 +40,24 @@ jobs:
     strategy:
       matrix:
         # We're only running one platform per Python version to speed up builds
+        Python36Linux:
+          imageName: "ubuntu-20.04"
+          python.version: "3.6"
+        #        Python36Windows:
+        #          imageName: "windows-latest"
+        #          python.version: "3.6"
+        #        Python36Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.6"
+        #        Python37Linux:
+        #          imageName: "ubuntu-20.04"
+        #          python.version: "3.7"
+        Python37Windows:
+          imageName: "windows-latest"
+          python.version: "3.7"
+        #        Python37Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.7"
         #        Python38Linux:
         #          imageName: "ubuntu-latest"
         #          python.version: "3.8"

From d5cf51599e64f181433f1e2268db50a2152db299 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:45:52 +0900
Subject: [PATCH 350/504] Remove old model shortcuts (#11916)

* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/download.py | 12 ++----------
 spacy/errors.py       |  9 ---------
 spacy/util.py         |  2 +-
 3 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 5e460717cc4..0b8ed54ed3c 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,16 +7,8 @@
 from wasabi import msg
 
 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
-    get_minor_version,
-    is_in_interactive,
-    is_in_jupyter,
-    is_package,
-    is_prerelease_version,
-    run_command,
-)
-from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
+from ..util import is_package, get_minor_version, run_command
+from ..util import is_prerelease_version
 
 
 @app.command(
diff --git a/spacy/errors.py b/spacy/errors.py
index de47325332a..a89a81db427 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -988,15 +988,6 @@ class Errors(metaclass=ErrorsWithCodes):
              "but got '{received_type}'")
 
 
-# Deprecated model shortcuts, only used in errors and warnings
-OLD_MODEL_SHORTCUTS = {
-    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
-    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
-    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
-    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
-}
-
-
 # fmt: on
 
 
diff --git a/spacy/util.py b/spacy/util.py
index fdc02a717cc..4f4718af5ff 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -61,7 +61,7 @@
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
+from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From fbdb77760ed89d8913136c011cec23896d7b7ba6 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 8 Dec 2022 13:24:45 +0100
Subject: [PATCH 351/504] Remove unused, experimental multi-task components
 (#11919)

* Remove experimental multi-task components

These are incomplete implementations and are not usable in their current state.

* Remove orphaned error message

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Revert "Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)"

This reverts commit 77c0fd7b176be80e8438fa21440a85d1fe26e39b.

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
---
 spacy/errors.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index a89a81db427..079f2efb7ca 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -256,8 +256,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E016 = ("MultitaskObjective target should be function or one of: dep, "
-            "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
     E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
             "refers to an issue with the `Vocab` or `StringStore`.")

From e6054ae592767520a6d0e4e66c6fce4dc4951756 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 12 Dec 2022 08:55:53 +0100
Subject: [PATCH 352/504] Custom extensions for spans with equal boundaries
 (#11429)

* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/doc/test_underscore.py |  4 ++++
 spacy/tokens/doc.pyx               |  2 +-
 spacy/tokens/span.pyx              | 23 +++++++++++++++++++----
 spacy/tokens/underscore.py         |  7 ++-----
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index 3ab7de76323..9c73907780c 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+
 
 # Helper functions
 def _get_tuple(s: Span):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 169199bc563..3c7b728f41b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1771,7 +1771,7 @@ cdef class Doc:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 409180b4973..1c19cc6d495 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -263,7 +263,7 @@ cdef class Span:
         """Custom extension attributes registered via `set_extension`."""
         cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char)
+                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -885,7 +885,12 @@ cdef class Span:
             return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.span_c().label = label
+            if label != self.span_c().label :
+                old_label = self.span_c().label
+                self.span_c().label = label
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property kb_id:
         def __get__(self):
@@ -893,7 +898,12 @@ cdef class Span:
             return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.span_c().kb_id = kb_id
+            if kb_id != self.span_c().kb_id :
+                old_kb_id = self.span_c().kb_id
+                self.span_c().kb_id = kb_id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property id:
         def __get__(self):
@@ -901,7 +911,12 @@ cdef class Span:
             return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.span_c().id = id
+            if id != self.span_c().id :
+                old_id = self.span_c().id
+                self.span_c().id = id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
+                Underscore._replace_keys(old, new)
 
     property ent_id:
         """Alias for the span's ID."""
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index c3e3641d454..63706851286 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
+from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
-    from .span import Span
     from .token import Token
 
 
@@ -40,10 +40,7 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
-        # We used to check if obj is a span, however, this introduces an
-        # import cycle between the span and underscore modeles. So we
-        # do a structural type check instead.
-        if hasattr(obj, "id") and hasattr(obj, "label") and hasattr(obj, "kb_id"):
+        if type(obj) == Span:
             object.__setattr__(self, "_label", label)
             object.__setattr__(self, "_kb_id", kb_id)
             object.__setattr__(self, "_span_id", span_id)

From a19fdd731525d077c1c121bbca8658fe329fa8a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Sat, 17 Dec 2022 14:32:19 +0100
Subject: [PATCH 353/504] Fix v4 branch to build against Thinc v9 (#11921)

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 pyproject.toml                                |  5 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 +-
 spacy/ml/parser_model.pyx                     |  4 +-
 .../_parser_internals/_beam_utils.pxd         |  1 -
 .../_parser_internals/_beam_utils.pyx         |  3 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |  7 +-
 spacy/pipeline/_parser_internals/ner.pyx      |  2 +
 spacy/pipeline/_parser_internals/search.pxd   |  7 +-
 spacy/pipeline/_parser_internals/search.pyx   | 13 ++--
 spacy/pipeline/edit_tree_lemmatizer.py        |  7 +-
 spacy/pipeline/morphologizer.pyx              |  5 +-
 spacy/pipeline/senter.pyx                     |  6 +-
 spacy/pipeline/tagger.pyx                     | 11 ++-
 spacy/pipeline/transition_parser.pyx          | 21 +++---
 spacy/tests/conftest.py                       |  5 ++
 spacy/tests/parser/_search.pyx                | 69 +++++++++----------
 17 files changed, 86 insertions(+), 86 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bfd7e68d1f7..77e1471426f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,8 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
-    "numpy>=1.15.0; python_version < '3.9'",
-    "numpy>=1.25.0; python_version >= '3.9'",
+    "thinc>=9.0.0.dev0,<9.1.0",
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 4b58e75506d..26ef5625339 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 9b847bb57ab..70f1a3e0513 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,8 +37,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
-    wasabi>=0.9.1,<1.2.0
+    thinc>=9.0.0.dev0,<9.1.0
+    wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 843275f4c8b..7f18ea1ba7f 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -2,7 +2,6 @@
 cimport numpy as np
 from libc.math cimport exp
 from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 
 import numpy
@@ -101,8 +100,7 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
     sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
                        n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
-                     &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = _arg_max(&A.unmaxed[index], n.pieces)
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 5a452e56a88..571f246b1e3 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,6 +1,5 @@
 from ...typedefs cimport class_t, hash_t
 
-
 # These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 273cc6c1078..84e30bab396 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -3,15 +3,14 @@
 cimport numpy as np
 
 import numpy
+from cpython.ref cimport PyObject, Py_XDECREF
 
 from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
 from .search cimport Beam, MaxViolation
-
 from .search import MaxViolation
-
 from .stateclass cimport StateC, StateClass
 
 
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 6ffceae10d3..11e3f483a79 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -18,7 +18,6 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
-
 from .search cimport Beam
 
 
@@ -328,7 +327,7 @@ cdef class Shift:
     * At least two words in sentence
     * Word has not been shifted before
 
-    Cost: push_cost 
+    Cost: push_cost
 
     Action:
     * Mark B[0] as 'shifted'
@@ -507,8 +506,8 @@ cdef class RightArc:
 
 
 cdef class Break:
-    """Mark the second word of the buffer as the start of a 
-    sentence. 
+    """Mark the second word of the buffer as the start of a
+    sentence.
 
     Validity:
     * len(buffer) >= 2
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 7577f3f18b3..324a497c9fb 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -22,6 +22,8 @@ from ...typedefs cimport attr_t, weight_t
 from ...training import split_bilu_label
 
 from ...training.example cimport Example
+from .search cimport Beam
+from .stateclass cimport StateClass
 from ._state cimport StateC
 from .search cimport Beam
 from .stateclass cimport StateClass
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index ad68dc5c718..de6a887bed5 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -1,10 +1,12 @@
 from cymem.cymem cimport Pool
-from libc.stdint cimport uint32_t, uint64_t
+
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
 from libcpp.pair cimport pair
 from libcpp.queue cimport priority_queue
 from libcpp.vector cimport vector
 
-from ...typedefs cimport class_t, hash_t, weight_t
+from ...typedefs cimport class_t, weight_t, hash_t
 
 ctypedef pair[weight_t, size_t] Entry
 ctypedef priority_queue[Entry] Queue
@@ -58,6 +60,7 @@ cdef class Beam:
                      void* extra_args) except -1
     cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
 
+
     cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
         self.scores[i][j] = score
         self.is_valid[i][j] = is_valid
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 52d5cdaa891..1d9b6dd7adf 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,8 +1,10 @@
-# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
+# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
+from libc.string cimport memset, memcpy
+from libc.math cimport log, exp
+import math
+
 from cymem.cymem cimport Pool
-from libc.math cimport exp
-from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap
 
 
@@ -67,7 +69,7 @@ cdef class Beam:
             self.costs[i][j] = costs[j]
 
     cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i
+        cdef int i, j
         for i in range(self.width):
             memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
             memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@@ -173,6 +175,7 @@ cdef class Beam:
         beam-width, and n is the number of classes.
         """
         cdef Entry entry
+        cdef weight_t score
         cdef _State* s
         cdef int i, j, move_id
         assert self.size >= 1
@@ -265,7 +268,7 @@ cdef class MaxViolation:
                 # This can happen from non-monotonic actions
                 # If we find a better gold analysis this way, be sure to keep it.
                 elif pred._states[i].loss <= 0 \
-                        and tuple(pred.histories[i]) not in seen_golds:
+                and tuple(pred.histories[i]) not in seen_golds:
                     g_scores.append(pred._states[i].score)
                     g_hist.append(list(pred.histories[i]))
             for i in range(gold.size):
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 1f16b44cfed..95ffecc6ae8 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,9 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model
 from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 
 from .. import util
 from ..errors import Errors
@@ -137,7 +138,9 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
+        loss_func = LegacySequenceCategoricalCrossentropy(
+            normalize=False, missing_value=-1
+        )
 
         truths = []
         for eg in examples:
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index c26be7912a9..92c1fed8efb 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,7 +1,8 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import srsly
-from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 from itertools import islice
 from typing import Callable, Dict, Iterable, Optional, Union
@@ -324,7 +325,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index dd56b4a62e6..d8fdf8f739f 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -4,7 +4,9 @@ from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
 import srsly
-from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
@@ -184,7 +186,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 95016072ef3..c641ccd6f32 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -2,7 +2,8 @@
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import numpy
 import srsly
-from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.api import Model, set_dropout_rate, Config
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
@@ -272,6 +273,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
+        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -322,12 +324,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(
-            names=self.labels,
-            normalize=False,
-            neg_prefix=self.cfg["neg_prefix"],
-            label_smoothing=self.cfg["label_smoothing"]
-        )
+        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index c728f1b7909..9c3543101bc 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -15,6 +15,9 @@ from libcpp.vector cimport vector
 
 import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -31,18 +34,12 @@ from thinc.api import (
 )
 from thinc.types import Floats2d
 
-from ..ml.parser_model cimport (
-    ActivationsC,
-    SizesC,
-    WeightsC,
-    alloc_activations,
-    arg_max_if_valid,
-    cpu_log_loss,
-    free_activations,
-    get_c_sizes,
-    get_c_weights,
-    predict_states,
-)
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
 from ._parser_internals.stateclass cimport StateClass
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 28551f9ee63..9a22f4a5a7e 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -4,7 +4,12 @@
 import sys
 
 import pytest
+from spacy.util import get_lang_class
+import functools
 from hypothesis import settings
+import inspect
+import importlib
+import sys
 
 from spacy.util import get_lang_class
 
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index ca2a2916094..23fc8164412 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -1,18 +1,15 @@
 # cython: infer_types=True, binding=True
-from cymem.cymem cimport Pool
-
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t
-
-import pytest
+from spacy.typedefs cimport class_t, weight_t
+from cymem.cymem cimport Pool
 
 from ..conftest import cytest
-
+import pytest
 
 cdef struct TestState:
     int length
     int x
-    char *string
+    Py_UNICODE* string
 
 
 cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
@@ -22,7 +19,7 @@ cdef int transition(void* dest, void* src, class_t clas, void* extra_args) excep
     dest_state.x = src_state.x
     dest_state.x += clas
     if extra_args != NULL:
-        dest_state.string = <char *>extra_args
+        dest_state.string = <Py_UNICODE*>extra_args
     else:
         dest_state.string = src_state.string
 
@@ -32,9 +29,9 @@ cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
     state.length = n
     state.x = 1
     if extra_args == NULL:
-        state.string = 'default'
+        state.string = u'default'
     else:
-        state.string = <char *>extra_args
+        state.string = <Py_UNICODE*>extra_args
     return state
 
 
@@ -42,58 +39,54 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
     state = <TestState*>state
     mem.free(state)
 
-
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width",
-                         [
-                             (2, 3),
-                             (3, 6),
-                             (4, 20),
-                         ]
-                         )
+    [
+        (2, 3),
+        (3, 6),
+        (4, 20),
+    ]
+)
 def test_init(nr_class, beam_width):
     b = Beam(nr_class, beam_width)
     assert b.size == 1
     assert b.width == beam_width
     assert b.nr_class == nr_class
 
-
 @cytest
 def test_init_violn():
     MaxViolation()
 
-
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-                         [
-                             (2, 3, 3),
-                             (3, 6, 15),
-                             (4, 20, 32),
-                         ]
-                         )
+    [
+        (2, 3, 3),
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
 def test_initialize(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length, s.length
-        assert s.string.decode('utf8') == 'default'
+        assert s.string == 'default'
 
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length,extra",
-                         [
-                             (2, 3, 4, None),
-                             (3, 6, 15, u"test beam 1"),
-                         ]
-                         )
+    [
+        (2, 3, 4, None),
+        (3, 6, 15, u"test beam 1"),
+    ]
+)
 def test_initialize_extra(nr_class, beam_width, length, extra):
-    extra = extra.encode("utf-8") if extra is not None else None
     b = Beam(nr_class, beam_width)
     if extra is None:
         b.initialize(initialize, destroy, length, NULL)
     else:
-        b.initialize(initialize, destroy, length, <void*><char*>extra)
+        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length
@@ -101,11 +94,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-                         [
-                             (3, 6, 15),
-                             (4, 20, 32),
-                         ]
-                         )
+    [
+        (3, 6, 15),
+        (4, 20, 32),
+    ]
+)
 def test_transition(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)

From ac255862eb1fd28c1016fae1381d0a9797623da8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 22 Dec 2022 10:23:31 +0100
Subject: [PATCH 354/504] Fix fallout from a previous merge

---
 spacy/pipeline/textcat_multilabel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index d38beb441da..4c165e02b03 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -175,6 +175,7 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.
         save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init

From 93ce0f0285d225b7aeb80d7e5d30baaa4bedb1c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 29 Dec 2022 08:03:24 +0100
Subject: [PATCH 355/504] Adjust to new `Schedule` class and pass scores to
 `Optimizer` (#12008)

* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
---
 pyproject.toml             |  2 +-
 requirements.txt           |  2 +-
 setup.cfg                  |  4 ++--
 spacy/training/batchers.py | 39 ++++++++++++++++----------------------
 spacy/training/loop.py     |  2 +-
 spacy/util.py              | 13 +++++++++----
 6 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 77e1471426f..58e4382e829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev0,<9.1.0",
+    "thinc>=9.0.0.dev1,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 26ef5625339..0325dda2ee9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev0,<9.1.0
+thinc>=9.0.0.dev1,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 70f1a3e0513..2c1a0591724 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,8 +37,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev0,<9.1.0
-    wasabi>=0.9.1,<1.1.0
+    thinc>=9.0.0.dev1,<9.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 21f1b29f5a2..9557ffb4eca 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,23 +1,11 @@
 import itertools
-from functools import partial
-from typing import (
-    Any,
-    Callable,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Sequence,
-    TypeVar,
-    Union,
-)
+from thinc.schedules import Schedule, constant as constant_schedule
 
 from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
 
-SizingSchedule = Union[Iterable[int], int, Schedule]
-Sizing = Union[Iterable[int], int]
+Sizing = Union[Sequence[int], int, Schedule[int]]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -122,12 +110,13 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_: Iterator[int] = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    for outer_batch in minibatch(seqs, size=buffer):
+        assert isinstance(size, Schedule)
+        size_ = size
+    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
         outer_batch = list(outer_batch)
-        target_size = next(size_)
+        target_size = size_(step)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -158,10 +147,12 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_: Iterator[int] = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
-        size_ = iter(size)
-    target_size = next(size_)
+        assert isinstance(size, Schedule)
+        size_ = size
+    step = 0
+    target_size = size_(step)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -186,7 +177,8 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            target_size = next(size_)
+            step += 1
+            target_size = size_(step)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -204,7 +196,8 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                target_size = next(size_)
+                step += 1
+                target_size = size_(step)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 575a583b78c..0f8d561b9a6 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -489,7 +489,7 @@ def train_while_improving(
                     score, other_scores = evaluate()
             else:
                 score, other_scores = evaluate()
-            optimizer.last_score = score  # type: ignore[assignment]
+            optimizer.last_score = score
             results.append((score, step))
             is_best_checkpoint = score == max(results)[0]
         else:
diff --git a/spacy/util.py b/spacy/util.py
index 4f4718af5ff..a76e8f73eeb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,7 +2,12 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import inspect
+import re
+from pathlib import Path
+import thinc
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+import functools
 import itertools
 import logging
 import os
@@ -1617,12 +1622,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
         size_ = iter(size)
     items = iter(items)
-    while True:
-        batch_size = next(size_)
+    for step in itertools.count():
+        batch_size = size_(step)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From 807aa826a49cc88171fe8d61a39872c86275c286 Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso <tetsuok@users.noreply.github.com>
Date: Wed, 4 Jan 2023 01:43:09 +0900
Subject: [PATCH 356/504] Delete unused imports for StringStore (#12040)

---
 spacy/lexeme.pxd    | 17 ++++-------------
 spacy/tokenizer.pxd |  4 ++++
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index ff51d77e8a9..2d14edcd6b0 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,19 +1,10 @@
 from numpy cimport ndarray
 
-from .attrs cimport (
-    ID,
-    LANG,
-    LENGTH,
-    LOWER,
-    NORM,
-    ORTH,
-    PREFIX,
-    SHAPE,
-    SUFFIX,
-    attr_id_t,
-)
+from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
+from .attrs cimport attr_id_t
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
+
 from .structs cimport LexemeC
-from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index b2e50969462..2610532b75d 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,6 +2,10 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
+from .typedefs cimport hash_t
+from .structs cimport LexemeC, SpanC, TokenC
+from .tokens.doc cimport Doc
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc

From 4a9528c96721f5b0e8192daed6acbdb183c9aef0 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 18:57:50 +0100
Subject: [PATCH 357/504] update tests from master to follow v4 principles

---
 spacy/tests/pipeline/test_entity_ruler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 31feb73edde..8e64ec1ba18 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,6 +357,7 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
+@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_pipe(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -366,6 +367,7 @@ def test_entity_ruler_fuzzy_pipe(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
+@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -375,12 +377,14 @@ def test_entity_ruler_fuzzy(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
+@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():
         return lambda x, y, z: False
 
     ruler = nlp.add_pipe(
+        "entity_ruler",
         "entity_ruler",
         config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
     )

From 78bca5356c9f88994434722e3cb2416f74d20b37 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 11 Jan 2023 19:04:06 +0100
Subject: [PATCH 358/504] update tests from master to follow v4 principles (2)

---
 spacy/tests/pipeline/test_entity_ruler.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 8e64ec1ba18..74731140688 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -357,7 +357,6 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_pipe(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -367,7 +366,6 @@ def test_entity_ruler_fuzzy_pipe(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
@@ -377,7 +375,6 @@ def test_entity_ruler_fuzzy(nlp):
     assert doc.ents[0].label_ == "HELLO"
 
 
-@pytest.mark.parametrize()
 def test_entity_ruler_fuzzy_disabled(nlp):
     @registry.misc("test_fuzzy_compare_disabled")
     def make_test_fuzzy_compare_disabled():

From ebd7837ae16d482ab9fa59f404e33bca074603d4 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 13 Jan 2023 11:14:58 +0100
Subject: [PATCH 359/504] fix anchors (#12095)

---
 website/docs/api/stringstore.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 9b63d586b87..b5218ecd7a4 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -112,7 +112,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | ------------------------------ |
 | **RETURNS** | A string in the store. ~~str~~ |
 
-## StringStore.items {#iter tag="method" new="4"}
+## StringStore.items {id="items", tag="method", version="4"}
 
 Iterate over the stored string-hash pairs in insertion order.
 
@@ -128,7 +128,7 @@ Iterate over the stored string-hash pairs in insertion order.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |
 
-## StringStore.keys {#iter tag="method" new="4"}
+## StringStore.keys {id="keys", tag="method", version="4"}
 
 Iterate over the stored strings in insertion order.
 
@@ -144,7 +144,7 @@ Iterate over the stored strings in insertion order.
 | ----------- | -------------------------------- |
 | **RETURNS** | A list of strings. ~~List[str]~~ |
 
-## StringStore.values {#iter tag="method" new="4"}
+## StringStore.values {id="values", tag="method", version="4"}
 
 Iterate over the stored string hashes in insertion order.
 

From 431fd4521e84180487f1b3bb736e77079b432c58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 16 Jan 2023 10:25:53 +0100
Subject: [PATCH 360/504] Add
 `TrainablePipe.{distill,get_teacher_student_loss}` (#12016)

* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                         |  3 ++
 spacy/pipeline/edit_tree_lemmatizer.py  | 19 ++++++++++
 spacy/pipeline/tagger.pyx               |  4 +--
 spacy/pipeline/trainable_pipe.pyx       | 24 ++++++-------
 spacy/pipeline/transition_parser.pyx    | 44 ++++++++++++++---------
 spacy/tests/parser/test_ner.py          |  5 +--
 spacy/tests/parser/test_parse.py        |  6 +---
 spacy/tests/pipeline/test_tagger.py     | 46 +++++++++++++++++++++++++
 spacy/tests/training/test_training.py   | 15 +++-----
 spacy/training/__init__.py              |  3 ++
 spacy/training/example.pyx              | 10 ++----
 website/docs/api/dependencyparser.mdx   | 18 +++++-----
 website/docs/api/edittreelemmatizer.mdx | 18 +++++-----
 website/docs/api/entityrecognizer.mdx   | 18 +++++-----
 website/docs/api/morphologizer.mdx      | 18 +++++-----
 website/docs/api/pipe.mdx               | 18 +++++-----
 website/docs/api/sentencerecognizer.mdx | 18 +++++-----
 website/docs/api/tagger.mdx             | 18 +++++-----
 18 files changed, 183 insertions(+), 122 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 079f2efb7ca..f42682d2ca3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -984,6 +984,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4000 = ("Expected a Doc as input, but got: '{type}'")
     E4001 = ("Expected input to be one of the following types: ({expected_types}), "
              "but got '{received_type}'")
+    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
+    E4003 = ("Training examples for distillation must have the exact same tokens in the "
+             "reference and predicted docs.")
 
 
 # fmt: on
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 95ffecc6ae8..8637dc077fa 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -163,6 +163,25 @@ def get_loss(
 
         return float(loss), d_scores
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+        
+        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index c641ccd6f32..cc43caa72c8 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Tuple
 import numpy
 import srsly
 from thinc.api import Model, set_dropout_rate, Config
@@ -273,7 +274,6 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -307,7 +307,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
         """
-        loss_func = SequenceCategoricalCrossentropy(normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index b9c297990f9..ff56357807e 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -7,7 +7,7 @@ import warnings
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples
+from ..training import validate_examples, validate_distillation_examples
 from ..errors import Errors, Warnings
 from .pipe import Pipe, deserialize_config
 from .. import util
@@ -59,14 +59,14 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
+
     def distill(self,
-                teacher_pipe: Optional["TrainablePipe"],
-                examples: Iterable["Example"],
-                *,
-                drop: float = 0.0,
-                sgd: Optional[Optimizer] = None,
-                losses: Optional[Dict[str, float]] = None
-                ) -> Dict[str, float]:
+               teacher_pipe: Optional["TrainablePipe"],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is typically trained on the probability
         distribution of the teacher, but details may differ per pipe.
@@ -74,15 +74,15 @@ cdef class TrainablePipe(Pipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            (teacher) and predicted (student) docs must have the same number of
-            tokens and the same orthography.
+            and predicted docs must have the same number of tokens and the
+            same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
         losses (Optional[Dict[str, float]]): Optional record of loss during
             distillation.
         RETURNS: The updated losses dictionary.
-        
+
         DOCS: https://spacy.io/api/pipe#distill
         """
         # By default we require a teacher pipe, but there are downstream
@@ -227,7 +227,7 @@ cdef class TrainablePipe(Pipe):
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
         """
         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9c3543101bc..e0449f188b5 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,9 +1,8 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
 from typing import Dict, Iterable, List, Optional, Tuple
-
+from cymem.cymem cimport Pool
 cimport numpy as np
 from cymem.cymem cimport Pool
 
@@ -16,7 +15,10 @@ from libcpp.vector cimport vector
 import random
 
 import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
 import numpy.random
 import numpy
 import numpy.random
@@ -47,6 +49,12 @@ from .trainable_pipe import TrainablePipe
 
 from ._parser_internals cimport _beam_utils
 
+from ..training import validate_examples, validate_get_examples
+from ..training import validate_distillation_examples
+from ..errors import Errors, Warnings
+from .. import util
+from ..errors import Errors
+from ..training import validate_examples, validate_get_examples
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
@@ -244,13 +252,12 @@ cdef class Parser(TrainablePipe):
         raise NotImplementedError
 
     def distill(self,
-                teacher_pipe: Optional[TrainablePipe],
-                examples: Iterable["Example"],
-                *,
-                drop: float = 0.0,
-                sgd: Optional[Optimizer] = None,
-                losses: Optional[Dict[str, float]] = None
-                ):
+               teacher_pipe: Optional[TrainablePipe],
+               examples: Iterable["Example"],
+               *,
+               drop: float=0.0,
+               sgd: Optional[Optimizer]=None,
+               losses: Optional[Dict[str, float]]=None):
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is trained on the transition probabilities
         of the teacher.
@@ -258,15 +265,15 @@ cdef class Parser(TrainablePipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            (teacher) and predicted (student) docs must have the same number of
-            tokens and the same orthography.
+            and predicted docs must have the same number of tokens and the
+            same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
         losses (Optional[Dict[str, float]]): Optional record of loss during
             distillation.
         RETURNS: The updated losses dictionary.
-        
+
         DOCS: https://spacy.io/api/dependencyparser#distill
         """
         if teacher_pipe is None:
@@ -296,7 +303,7 @@ cdef class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
             states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
@@ -349,10 +356,10 @@ cdef class Parser(TrainablePipe):
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = SequenceCategoricalCrossentropy(normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
@@ -782,7 +789,10 @@ cdef class Parser(TrainablePipe):
         long_doc[:N], and another representing long_doc[N:]. In contrast to
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
-        cdef StateClass state
+        cdef:
+            StateClass start_state
+            StateClass state
+            Transition action
         all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 7c3a9d56249..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,9 +624,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -644,7 +642,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index f63d56f6922..42cf5ced998 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -402,10 +402,7 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
-    fix_random_seed(0)
+def test_distill():
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -423,7 +420,6 @@ def test_distill(max_moves):
 
     student = English()
     student_parser = student.add_pipe("parser")
-    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 50cc828a038..b6f94f7f97b 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -239,6 +239,52 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
+def test_is_distillable():
+    nlp = English()
+    tagger = nlp.add_pipe("tagger")
+    assert tagger.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["tagger"] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_tagger.distill(
+            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index e8a19947606..ef20ec365c6 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,17 +8,10 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import (
-    Alignment,
-    Corpus,
-    Example,
-    biluo_tags_to_offsets,
-    biluo_tags_to_spans,
-    docs_to_json,
-    iob_to_biluo,
-    offsets_to_biluo_tags,
-    validate_distillation_examples,
-)
+from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
+from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
+from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index adfc2bb6658..9445d0b63a5 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,3 +1,6 @@
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index daa6ca3f468..d121c9aa56f 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,12 +57,6 @@ def validate_examples(examples, method):
 
 
 def validate_distillation_examples(examples, method):
-    """Check that a batch of examples received during processing is valid
-    for distillation.
-
-    examples (Iterable[Examples]): A batch of examples.
-    method (str): The method name to show in error messages.
-    """
     validate_examples(examples, method)
     for eg in examples:
         if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
@@ -278,7 +272,7 @@ cdef class Example:
         heads = numpy.asarray(heads, dtype='i')
         gold_head_i = heads[gold_i]
 
-        # Select all gold tokens that are heads of the previously selected 
+        # Select all gold tokens that are heads of the previously selected
         # gold tokens (and are aligned to a single candidate token).
         g2c_len_heads = gold_to_cand.lengths[gold_head_i]
         g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
@@ -402,7 +396,7 @@ cdef class Example:
         span_dict = {}
         for key in self.reference.spans:
             span_tuples = []
-            for span in self.reference.spans[key]: 
+            for span in self.reference.spans[key]:
                 span_tuple = (span.start_char, span.end_char, span.label_, span.kb_id_)
                 span_tuples.append(span_tuple)
             span_dict[key] = span_tuples
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 296d6d87da5..5179ce48b84 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -154,15 +154,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index c8b5c71806b..2e099365758 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -138,15 +138,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index f503cc998b0..005d5d11deb 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -150,15 +150,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4660ec312fa..4f79458d319 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -144,15 +144,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index e1e7f5d7021..120c8f6908f 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -257,15 +257,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index dfb7ed308ba..02fd57102e2 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -129,15 +129,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 35e7a23b174..664fd7940c1 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -128,15 +128,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+| Name           | Description                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
 
 ## Tagger.pipe {id="pipe",tag="method"}
 

From a347b2e41b1116358f98a92bae2439aa9a848433 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 11:27:45 +0100
Subject: [PATCH 361/504] Merge the parser refactor into `v4` (#10940)

* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89edb306608826dca50619ea8a60de2b14.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
  this required computing an intermittent flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut` size into
separate sequences that have at most `update_with_oracle_cut`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a git that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/errors.py                               |   6 +-
 spacy/ml/_precomputable_affine.py             | 164 -----
 spacy/ml/models/parser.py                     | 175 ++---
 spacy/ml/parser_model.pxd                     |  49 --
 spacy/ml/parser_model.pyx                     | 497 --------------
 spacy/ml/tb_framework.pxd                     |  28 +
 spacy/ml/tb_framework.py                      |  51 --
 spacy/ml/tb_framework.pyx                     | 621 ++++++++++++++++++
 .../_parser_internals/_beam_utils.pyx         |   3 +-
 .../_parser_internals/_parser_utils.pxd       |   2 +
 .../_parser_internals/_parser_utils.pyx       |  22 +
 spacy/pipeline/_parser_internals/_state.pxd   |  59 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +
 spacy/pipeline/_parser_internals/batch.pxd    |   2 +
 spacy/pipeline/_parser_internals/batch.pyx    |  52 ++
 spacy/pipeline/_parser_internals/ner.pyx      |   2 +
 .../pipeline/_parser_internals/stateclass.pyx |   7 +
 .../_parser_internals/transition_system.pxd   |   7 +
 .../_parser_internals/transition_system.pyx   |  71 ++
 .../{dep_parser.pyx => dep_parser.py}         |  17 +-
 spacy/pipeline/{ner.pyx => ner.py}            |  33 +-
 spacy/pipeline/transition_parser.pxd          |  21 -
 spacy/pipeline/transition_parser.pyx          | 507 ++++++--------
 spacy/tests/parser/test_ner.py                |  11 +-
 spacy/tests/parser/test_parse.py              |  80 ++-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  53 +-
 spacy/training/example.pyx                    |   1 -
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 35 files changed, 1294 insertions(+), 1368 deletions(-)
 delete mode 100644 spacy/ml/_precomputable_affine.py
 delete mode 100644 spacy/ml/parser_model.pxd
 delete mode 100644 spacy/ml/parser_model.pyx
 create mode 100644 spacy/ml/tb_framework.pxd
 delete mode 100644 spacy/ml/tb_framework.py
 create mode 100644 spacy/ml/tb_framework.pyx
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 create mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 create mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (97%)
 rename spacy/pipeline/{ner.pyx => ner.py} (93%)
 delete mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index 0eb529c2098..a4a87d68aec 100755
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,10 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -45,6 +43,7 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -52,6 +51,7 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2817147f3e9..36325711d4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,12 +90,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -111,12 +110,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -387,12 +385,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -405,12 +402,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/errors.py b/spacy/errors.py
index f42682d2ca3..245c89aa582 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,10 +217,7 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
-    # v4 warning strings
-    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
-            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
-            "to return `True` in `.supports_prior_probs`.")
+    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -987,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
+    E4004 = ("Backprop is not supported when is_train is not set.")
 
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
deleted file mode 100644
index 1c20c622b2c..00000000000
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index e776174f6ed..59483839206 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,17 +1,22 @@
-from typing import List, Literal, Optional
-
-from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
+from typing import Optional, List, Tuple, Any
 from thinc.types import Floats2d
+from thinc.api import Model
+import warnings
 
+from ...errors import Errors, Warnings
+from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
-from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
+from ...tokens.doc import Doc
 
+TransitionSystem = Any  # TODO
+State = Any  # TODO
 
-@registry.architectures("spacy.TransitionBasedParser.v2")
-def build_tb_parser_model(
+
+@registry.architectures.register("spacy.TransitionBasedParser.v2")
+def transition_parser_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
@@ -19,6 +24,46 @@ def build_tb_parser_model(
     maxout_pieces: int,
     use_upper: bool,
     nO: Optional[int] = None,
+) -> Model:
+    if not use_upper:
+        warnings.warn(Warnings.W400)
+
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v3")
+def transition_parser_v3(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+) -> Model:
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+def build_tb_parser_model(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
 ) -> Model:
     """
     Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -51,14 +96,7 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
-        is replaced with a ReLu non-linearity if use_upper=True, and no
-        non-linearity if use_upper=False.
-    use_upper (bool): Whether to use an additional hidden layer after the state
-        vector in order to predict the action scores. It is recommended to set
-        this to False for large pretrained models such as transformers, and True
-        for smaller networks. The upper layer is computed on CPU, which becomes
-        a bottleneck on larger GPU-based models, where it's also less necessary.
+        Recommended values are 1, 2 or 3.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -69,106 +107,11 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        list2array(),
-        Linear(hidden_width, t2v_width),
+    return TransitionModel(
+        tok2vec=tok2vec,
+        state_tokens=nr_feature_tokens,
+        hidden_width=hidden_width,
+        maxout_pieces=maxout_pieces,
+        nO=nO,
+        unseen_classes=set(),
     )
-    tok2vec.set_dim("nO", hidden_width)
-    lower = _define_lower(
-        nO=hidden_width if use_upper else nO,
-        nF=nr_feature_tokens,
-        nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
-    )
-    upper = None
-    if use_upper:
-        with use_ops("cpu"):
-            # Initialize weights at zero, as it's a classification layer.
-            upper = _define_upper(nO=nO, nI=None)
-    return TransitionModel(tok2vec, lower, upper, resize_output)
-
-
-def _define_upper(nO, nI):
-    return Linear(nO=nO, nI=nI, init_W=zero_init)
-
-
-def _define_lower(nO, nF, nI, nP):
-    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
-
-
-def resize_output(model, new_nO):
-    if model.attrs["has_upper"]:
-        return _resize_upper(model, new_nO)
-    return _resize_lower(model, new_nO)
-
-
-def _resize_upper(model, new_nO):
-    upper = model.get_ref("upper")
-    if upper.has_dim("nO") is None:
-        upper.set_dim("nO", new_nO)
-        return model
-    elif new_nO == upper.get_dim("nO"):
-        return model
-
-    smaller = upper
-    nI = smaller.maybe_get_dim("nI")
-    with use_ops("cpu"):
-        larger = _define_upper(nO=new_nO, nI=nI)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc2f(new_nO, nI)
-        larger_b = larger.ops.alloc1f(new_nO)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        # Weights are stored in (nr_out, nr_in) format, so we're basically
-        # just adding rows here.
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:old_nO] = smaller_W
-            larger_b[:old_nO] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-    model._layers[-1] = larger
-    model.set_ref("upper", larger)
-    return model
-
-
-def _resize_lower(model, new_nO):
-    lower = model.get_ref("lower")
-    if lower.has_dim("nO") is None:
-        lower.set_dim("nO", new_nO)
-        return model
-
-    smaller = lower
-    nI = smaller.maybe_get_dim("nI")
-    nF = smaller.maybe_get_dim("nF")
-    nP = smaller.maybe_get_dim("nP")
-    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
-        larger_b = larger.ops.alloc2f(new_nO, nP)
-        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        smaller_pad = smaller.get_param("pad")
-        # Copy the old weights and padding into the new layer
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:, 0:old_nO, :, :] = smaller_W
-            larger_pad[:, :, 0:old_nO, :] = smaller_pad
-            larger_b[0:old_nO, :] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-        larger.set_param("pad", larger_pad)
-    model._layers[1] = larger
-    model.set_ref("lower", larger)
-    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
deleted file mode 100644
index 88386255147..00000000000
--- a/spacy/ml/parser_model.pxd
+++ /dev/null
@@ -1,49 +0,0 @@
-from libc.string cimport memcpy, memset
-from thinc.backends.cblas cimport CBlas
-
-from ..pipeline._parser_internals._state cimport StateC
-from ..typedefs cimport hash_t, weight_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const float* seen_classes
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* scores
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
-
-
-cdef WeightsC get_c_weights(model) except *
-
-cdef SizesC get_c_sizes(model, int batch_size) except *
-
-cdef ActivationsC alloc_activations(SizesC n) nogil
-
-cdef void free_activations(const ActivationsC* A) nogil
-
-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-                         const WeightsC* W, SizesC n) nogil
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
-
-cdef void cpu_log_loss(float* d_scores, const float* costs,
-                       const int* is_valid, const float* scores, int O) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
deleted file mode 100644
index 7f18ea1ba7f..00000000000
--- a/spacy/ml/parser_model.pyx
+++ /dev/null
@@ -1,497 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-cimport numpy as np
-from libc.math cimport exp
-from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.cblas cimport saxpy, sgemm
-
-import numpy
-import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps, get_ops
-
-from .. import util
-from ..errors import Errors
-
-from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport weight_t
-
-
-cdef WeightsC get_c_weights(model) except *:
-    cdef WeightsC output
-    cdef precompute_hiddens state2vec = model.state2vec
-    output.feat_weights = state2vec.get_feat_weights()
-    output.feat_bias = <const float*>state2vec.bias.data
-    cdef np.ndarray vec2scores_W
-    cdef np.ndarray vec2scores_b
-    if model.vec2scores is None:
-        output.hidden_weights = NULL
-        output.hidden_bias = NULL
-    else:
-        vec2scores_W = model.vec2scores.get_param("W")
-        vec2scores_b = model.vec2scores.get_param("b")
-        output.hidden_weights = <const float*>vec2scores_W.data
-        output.hidden_bias = <const float*>vec2scores_b.data
-    cdef np.ndarray class_mask = model._class_mask
-    output.seen_classes = <const float*>class_mask.data
-    return output
-
-
-cdef SizesC get_c_sizes(model, int batch_size) except *:
-    cdef SizesC output
-    output.states = batch_size
-    if model.vec2scores is None:
-        output.classes = model.state2vec.get_dim("nO")
-    else:
-        output.classes = model.vec2scores.get_dim("nO")
-    output.hiddens = model.state2vec.get_dim("nO")
-    output.pieces = model.state2vec.get_dim("nP")
-    output.feats = model.state2vec.get_dim("nF")
-    output.embed_width = model.tokvecs.shape[1]
-    return output
-
-
-cdef ActivationsC alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    resize_activations(&A, n)
-    return A
-
-
-cdef void free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.scores)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.scores = <float*>realloc(A.scores,
-                                   n.states * n.classes * sizeof(A.scores[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-                         const WeightsC* W, SizesC n) nogil:
-    resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
-                       n.feats, n.hiddens * n.pieces)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = _arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    memset(A.scores, 0, n.states * n.classes * sizeof(float))
-    if W.hidden_weights == NULL:
-        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
-                     <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens, 0.0,
-                     A.scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = A.scores[0]
-    for i in range(1, n.states * n.classes):
-        if A.scores[i] < min_:
-            min_ = A.scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if not W.seen_classes[j]:
-                A.scores[i*n.classes+j] = min_
-
-
-cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
-                             const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    padding = cached
-    cached += F * O
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
-
-
-cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
-                       const float* scores, int O) nogil:
-    """Do multi-label log loss"""
-    cdef double max_, gmax, Z, gZ
-    best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = _arg_max(scores, O)
-
-    if best == -1 or guess == -1:
-        # These shouldn't happen, but if they do, we want to make sure we don't
-        # cause an OOB access.
-        return
-    Z = 1e-10
-    gZ = 1e-10
-    max_ = scores[guess]
-    gmax = scores[best]
-    for i in range(O):
-        Z += exp(scores[i] - max_)
-        if costs[i] <= costs[best]:
-            gZ += exp(scores[i] - gmax)
-    for i in range(O):
-        if costs[i] <= costs[best]:
-            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
-        else:
-            d_scores[i] = exp(scores[i]-max_) / Z
-
-
-cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-                         const int* is_valid, int n) nogil:
-    # Find minimum cost
-    cdef float cost = 1
-    for i in range(n):
-        if is_valid[i] and costs[i] < cost:
-            cost = costs[i]
-    # Now find best-scoring with that cost
-    cdef int best = -1
-    for i in range(n):
-        if costs[i] <= cost and is_valid[i]:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-class ParserStepModel(Model):
-    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-                 dropout=0.1):
-        Model.__init__(self, name="parser_step_model", forward=step_forward)
-        self.attrs["has_upper"] = has_upper
-        self.attrs["dropout_rate"] = dropout
-        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
-        if layers[1].get_dim("nP") >= 2:
-            activation = "maxout"
-        elif has_upper:
-            activation = None
-        else:
-            activation = "relu"
-        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
-                                            activation=activation, train=train)
-        if has_upper:
-            self.vec2scores = layers[-1]
-        else:
-            self.vec2scores = None
-        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
-        self.backprops = []
-        self._class_mask = numpy.zeros((self.nO,), dtype='f')
-        self._class_mask.fill(1)
-        if unseen_classes is not None:
-            for class_ in unseen_classes:
-                self._class_mask[class_] = 0.
-
-    def clear_memory(self):
-        del self.tokvecs
-        del self.bp_tokvecs
-        del self.state2vec
-        del self.backprops
-        del self._class_mask
-
-    @property
-    def nO(self):
-        if self.attrs["has_upper"]:
-            return self.vec2scores.get_dim("nO")
-        else:
-            return self.state2vec.get_dim("nO")
-
-    def class_is_unseen(self, class_):
-        return self._class_mask[class_]
-
-    def mark_class_unseen(self, class_):
-        self._class_mask[class_] = 0
-
-    def mark_class_seen(self, class_):
-        self._class_mask[class_] = 1
-
-    def get_token_ids(self, states):
-        cdef StateClass state
-        states = [state for state in states if not state.is_final()]
-        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
-                                          dtype='i', order='C')
-        ids.fill(-1)
-        c_ids = <int*>ids.data
-        for state in states:
-            state.c.set_context_tokens(c_ids, ids.shape[1])
-            c_ids += ids.shape[1]
-        return ids
-
-    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-        if isinstance(self.state2vec.ops, CupyOps) \
-           and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            self.backprops.append((
-                util.get_async(self.cuda_stream, token_ids),
-                util.get_async(self.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
-    def finish_steps(self, golds):
-        # Add a padding vector to the d_tokvecs gradient, so that missing
-        # values don't affect the real gradient.
-        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
-        # Tells CUDA to block, so our async copies complete.
-        if self.cuda_stream is not None:
-            self.cuda_stream.synchronize()
-        for ids, d_vector, bp_vector in self.backprops:
-            d_state_features = bp_vector((d_vector, ids))
-            ids = ids.flatten()
-            d_state_features = d_state_features.reshape(
-                (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
-        # Padded -- see update()
-        self.bp_tokvecs(d_tokvecs[:-1])
-        return d_tokvecs
-
-
-NUMPY_OPS = NumpyOps()
-
-
-def step_forward(model: ParserStepModel, states, is_train):
-    token_ids = model.get_token_ids(states)
-    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
-    mask = None
-    if model.attrs["has_upper"]:
-        dropout_rate = model.attrs["dropout_rate"]
-        if is_train and dropout_rate > 0:
-            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
-            vector *= mask
-        scores, get_d_vector = model.vec2scores(vector, is_train)
-    else:
-        scores = NumpyOps().asarray(vector)
-        def get_d_vector(d_scores): return d_scores
-    # If the class is unseen, make sure its score is minimum
-    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
-
-    def backprop_parser_step(d_scores):
-        # Zero vectors for unseen classes
-        d_scores *= model._class_mask
-        d_vector = get_d_vector(d_scores)
-        if mask is not None:
-            d_vector *= mask
-        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
-        return None
-    return scores, backprop_parser_step
-
-
-cdef class precompute_hiddens:
-    """Allow a model to be "primed" by pre-computing input features in bulk.
-
-    This is used for the parser, where we want to take a batch of documents,
-    and compute vectors for each (token, position) pair. These vectors can then
-    be reused, especially for beam-search.
-
-    Let's say we're using 12 features for each state, e.g. word at start of
-    buffer, three words on stack, their children, etc. In the normal arc-eager
-    system, a document of length N is processed in 2*N states. This means we'll
-    create 2*N*12 feature vectors --- but if we pre-compute, we only need
-    N*12 vector computations. The saving for beam-search is much better:
-    if we have a beam of k, we'll normally make 2*N*12*K computations --
-    so we can save the factor k. This also gives a nice CPU/GPU division:
-    we can do all our hard maths up front, packed into large multiplications,
-    and do the hard-to-program parsing on the CPU.
-    """
-    cdef readonly int nF, nO, nP
-    cdef bint _is_synchronized
-    cdef public object ops
-    cdef public object numpy_ops
-    cdef public object _cpu_ops
-    cdef np.ndarray _features
-    cdef np.ndarray _cached
-    cdef np.ndarray bias
-    cdef object _cuda_stream
-    cdef object _bp_hiddens
-    cdef object activation
-
-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
-                 activation="maxout", train=False):
-        gpu_cached, bp_features = lower_model(tokvecs, train)
-        cdef np.ndarray cached
-        if not isinstance(gpu_cached, numpy.ndarray):
-            # Note the passing of cuda_stream here: it lets
-            # cupy make the copy asynchronously.
-            # We then have to block before first use.
-            cached = gpu_cached.get(stream=cuda_stream)
-        else:
-            cached = gpu_cached
-        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
-            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
-        else:
-            self.bias = lower_model.get_param("b")
-        self.nF = cached.shape[1]
-        if lower_model.has_dim("nP"):
-            self.nP = lower_model.get_dim("nP")
-        else:
-            self.nP = 1
-        self.nO = cached.shape[2]
-        self.ops = lower_model.ops
-        self.numpy_ops = NumpyOps()
-        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
-        assert activation in (None, "relu", "maxout")
-        self.activation = activation
-        self._is_synchronized = False
-        self._cuda_stream = cuda_stream
-        self._cached = cached
-        self._bp_hiddens = bp_features
-
-    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized and self._cuda_stream is not None:
-            self._cuda_stream.synchronize()
-            self._is_synchronized = True
-        return <float*>self._cached.data
-
-    def has_dim(self, name):
-        if name == "nF":
-            return self.nF if self.nF is not None else True
-        elif name == "nP":
-            return self.nP if self.nP is not None else True
-        elif name == "nO":
-            return self.nO if self.nO is not None else True
-        else:
-            return False
-
-    def get_dim(self, name):
-        if name == "nF":
-            return self.nF
-        elif name == "nP":
-            return self.nP
-        elif name == "nO":
-            return self.nO
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def set_dim(self, name, value):
-        if name == "nF":
-            self.nF = value
-        elif name == "nP":
-            self.nP = value
-        elif name == "nO":
-            self.nO = value
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def __call__(self, X, bint is_train):
-        if is_train:
-            return self.begin_update(X)
-        else:
-            return self.predict(X), lambda X: X
-
-    def predict(self, X):
-        return self.begin_update(X)[0]
-
-    def begin_update(self, token_ids):
-        cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO, self.nP), dtype='f')
-        # This is tricky, but (assuming GPU available);
-        # - Input to forward on CPU
-        # - Output from forward on CPU
-        # - Input to backward on GPU!
-        # - Output from backward on GPU
-        bp_hiddens = self._bp_hiddens
-
-        cdef CBlas cblas = self._cpu_ops.cblas()
-
-        feat_weights = self.get_feat_weights()
-        cdef int[:, ::1] ids = token_ids
-        sum_state_features(cblas, <float*>state_vector.data,
-                           feat_weights, &ids[0, 0], token_ids.shape[0],
-                           self.nF, self.nO*self.nP)
-        state_vector += self.bias
-        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
-
-        def backward(d_state_vector_ids):
-            d_state_vector, token_ids = d_state_vector_ids
-            d_state_vector = bp_nonlinearity(d_state_vector)
-            d_tokens = bp_hiddens((d_state_vector, token_ids))
-            return d_tokens
-        return state_vector, backward
-
-    def _nonlinearity(self, state_vector):
-        if self.activation == "maxout":
-            return self._maxout_nonlinearity(state_vector)
-        else:
-            return self._relu_nonlinearity(state_vector)
-
-    def _maxout_nonlinearity(self, state_vector):
-        state_vector, mask = self.numpy_ops.maxout(state_vector)
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_maxout(d_best):
-            return self.ops.backprop_maxout(d_best, mask, self.nP)
-
-        return state_vector, backprop_maxout
-
-    def _relu_nonlinearity(self, state_vector):
-        state_vector = state_vector.reshape((state_vector.shape[0], -1))
-        mask = state_vector >= 0.
-        state_vector *= mask
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_relu(d_best):
-            d_best *= mask
-            return d_best.reshape((d_best.shape + (1,)))
-
-        return state_vector, backprop_relu
-
-cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
new file mode 100644
index 00000000000..965508519e8
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
deleted file mode 100644
index e351ad4e570..00000000000
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from thinc.api import Model, noop
-
-from ..util import registry
-from .parser_model import ParserStepModel
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
new file mode 100644
index 00000000000..79be13b00bd
--- /dev/null
+++ b/spacy/ml/tb_framework.pyx
@@ -0,0 +1,621 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+from typing import List, Tuple, Any, Optional, TypeVar, cast
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from libcpp.vector cimport vector
+import numpy
+cimport numpy as np
+from thinc.api import Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.api import NumpyOps
+from thinc.backends.cblas cimport CBlas, saxpy, sgemm
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
+from thinc.types import Ints1d, Ints2d
+
+from ..errors import Errors
+from ..pipeline._parser_internals import _beam_utils
+from ..pipeline._parser_internals.batch import GreedyBatch
+from ..pipeline._parser_internals._parser_utils cimport arg_max
+from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
+from ..pipeline._parser_internals.transition_system cimport TransitionSystem
+from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..tokens.doc import Doc
+from ..util import registry
+
+
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
+def TransitionModel(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    beam_width: int = 1,
+    beam_density: float = 0.0,
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
+
+    # FIXME: we use `output` as a container for the output layer's
+    # weights and biases. Thinc optimizers cannot handle resizing
+    # of parameters. So, when the parser model is resized, we
+    # construct a new `output` layer, which has a different key in
+    # the optimizer. Once the optimizer supports parameter resizing,
+    # we can replace the `output` layer by `output_W` and `output_b`
+    # parameters in this model.
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+
+    return Model(
+        name="parser_model",
+        forward=forward,
+        init=init,
+        layers=[tok2vec_projected, output],
+        refs={
+            "tok2vec": tok2vec_projected,
+            "output": output,
+        },
+        params={
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
+        attrs={
+            "beam_width": beam_width,
+            "beam_density": beam_density,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    output = model.get_ref("output")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        output.set_dim("nO", new_nO)
+        output.initialize()
+        return model
+    elif new_nO <= old_nO:
+        return model
+    elif output.has_param("W"):
+        nH = model.get_dim("nH")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
+        new_W[:old_nO] = old_W  # type: ignore
+        new_b[:old_nO] = old_b  # type: ignore
+        for i in range(old_nO, new_nO):
+            model.attrs["unseen_classes"].add(i)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
+    # TODO: Avoid this private intrusion
+    model._dims["nO"] = new_nO
+    return model
+
+
+def init(
+    model,
+    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
+    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
+):
+    if X is not None:
+        docs, moves = X
+        model.get_ref("tok2vec").initialize(X=docs)
+    else:
+        model.get_ref("tok2vec").initialize()
+    inferred_nO = _infer_nO(Y)
+    if inferred_nO is not None:
+        current_nO = model.maybe_get_dim("nO")
+        if current_nO is None or current_nO != inferred_nO:
+            model.attrs["resize_output"](model, inferred_nO)
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nH = model.get_dim("nH")
+    nI = model.get_dim("nI")
+    nF = model.get_dim("nF")
+    ops = model.ops
+
+    Wl = ops.alloc2f(nH * nP, nF * nI)
+    bl = ops.alloc1f(nH * nP)
+    padl = ops.alloc1f(nI)
+    # Wl = zero_init(ops, Wl.shape)
+    Wl = glorot_uniform_init(ops, Wl.shape)
+    padl = uniform_init(ops, padl.shape)  # type: ignore
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
+    # model = _lsuv_init(model)
+    return model
+
+
+class TransitionModelInputs:
+    """
+    Input to transition model.
+    """
+
+    # dataclass annotation is not yet supported in Cython 0.29.x,
+    # so, we'll do something close to it.
+
+    actions: Optional[List[Ints1d]]
+    docs: List[Doc]
+    max_moves: int
+    moves: TransitionSystem
+    states: Optional[List[State]]
+
+    __slots__ = [
+        "actions",
+        "docs",
+        "max_moves",
+        "moves",
+        "states",
+    ]
+
+    def __init__(
+        self,
+        docs: List[Doc],
+        moves: TransitionSystem,
+        actions: Optional[List[Ints1d]]=None,
+        max_moves: int=0,
+        states: Optional[List[State]]=None):
+        """
+        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
+        docs (List[Doc]): Docs to predict transition sequences for.
+        max_moves: (int): the maximum number of moves to apply, values less
+            than 1 will apply moves to states until they are final states.
+        moves (TransitionSystem): the transition system to use when predicting
+            the transition sequences.
+        states (Optional[List[States]]): the initial states to predict the
+            transition sequences for. When absent, the initial states are
+            initialized from the provided Docs.
+        """
+        self.actions = actions
+        self.docs = docs
+        self.moves = moves
+        self.max_moves = max_moves
+        self.states = states
+
+
+def forward(model, inputs: TransitionModelInputs, is_train: bool):
+    docs = inputs.docs
+    moves = inputs.moves
+    actions = inputs.actions
+
+    beam_width = model.attrs["beam_width"]
+    hidden_pad = model.get_param("hidden_pad")
+    tok2vec = model.get_ref("tok2vec")
+
+    states = moves.init_batch(docs) if inputs.states is None else inputs.states
+    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
+    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
+    seen_mask = _get_seen_mask(model)
+
+    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
+        # Note: max_moves is only used during training, so we don't need to
+        #       pass it to the greedy inference path.
+        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
+    else:
+        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
+            feats, backprop_feats, seen_mask, is_train, actions=actions,
+            max_moves=inputs.max_moves)
+
+
+def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
+                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+    cdef vector[StateC*] c_states
+    cdef StateClass state
+    for state in states:
+        if not state.is_final():
+            c_states.push_back(state.c)
+    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
+    # Precomputed features have rows for each token, plus one for padding.
+    cdef int n_tokens = feats.shape[0] - 1
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
+    cdef CBlas cblas = model.ops.cblas()
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+
+    def backprop(dY):
+        raise ValueError(Errors.E4004)
+
+    return (states, scores), backprop
+
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+    cdef int i, j
+    cdef vector[StateC *] unfinished
+    cdef ActivationsC activations = _alloc_activations(sizes)
+    cdef np.ndarray step_scores
+    cdef np.ndarray step_actions
+
+    scores = []
+    while sizes.states >= 1:
+        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
+        step_actions = actions[0] if actions is not None else None
+        with nogil:
+            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
+            if actions is None:
+                # Validate actions, argmax, take action.
+                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
+                    sizes.states)
+            else:
+                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+        sizes.states = unfinished.size()
+        scores.append(step_scores)
+        unfinished.clear()
+        actions = actions[1:] if actions is not None else None
+    _free_activations(&activations)
+
+    return scores
+
+
+def _forward_fallback(
+    model: Model,
+    moves: TransitionSystem,
+    states: List[StateClass],
+    tokvecs, backprop_tok2vec,
+    feats,
+    backprop_feats,
+    seen_mask,
+    is_train: bool,
+    actions: Optional[List[Ints1d]]=None,
+    max_moves: int=0):
+    nF = model.get_dim("nF")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+
+    beam_width = model.attrs["beam_width"]
+    beam_density = model.attrs["beam_density"]
+
+    ops = model.ops
+
+    all_ids = []
+    all_which = []
+    all_statevecs = []
+    all_scores = []
+    if beam_width == 1:
+        batch = GreedyBatch(moves, states, None)
+    else:
+        batch = _beam_utils.BeamBatch(
+            moves, states, None, width=beam_width, density=beam_density
+        )
+    arange = ops.xp.arange(nF)
+    n_moves = 0
+    while not batch.is_done:
+        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
+        for i, state in enumerate(batch.get_unfinished_states()):
+            state.set_context_tokens(ids, i, nF)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
+        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
+        preacts2f += hidden_b
+        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
+        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
+        statevecs, which = ops.maxout(preacts)
+        # We don't use output's backprop, since we want to backprop for
+        # all states at once, rather than a single state.
+        scores = output.predict(statevecs)
+        scores[:, seen_mask] = ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
+        cpu_scores = ops.to_numpy(scores)
+        if actions is None:
+            batch.advance(cpu_scores)
+        else:
+            batch.advance_with_actions(actions[0])
+            actions = actions[1:]
+        all_scores.append(scores)
+        if is_train:
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
+        if n_moves >= max_moves >= 1:
+            break
+        n_moves += 1
+
+    def backprop_parser(d_states_d_scores):
+        ids = ops.xp.vstack(all_ids)
+        which = ops.xp.vstack(all_which)
+        statevecs = ops.xp.vstack(all_statevecs)
+        _, d_scores = d_states_d_scores
+        if model.attrs.get("unseen_classes"):
+            # If we have a negative gradient (i.e. the probability should
+            # increase) on any classes we filtered out as unseen, mark
+            # them as seen.
+            for clas in set(model.attrs["unseen_classes"]):
+                if (d_scores[:, clas] < 0).any():
+                    model.attrs["unseen_classes"].remove(clas)
+        d_scores *= seen_mask == False
+        # Calculate the gradients for the parameters of the output layer.
+        # The weight gemm is (nS, nO) @ (nS, nH).T
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
+        # This gemm is (nS, nO) @ (nO, nH)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
+        # Backprop through the maxout activation
+        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
+        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        # We don't need to backprop the summation, because we pass back the IDs instead
+        d_state_features = backprop_feats((d_preacts2f, ids))
+        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
+        ops.scatter_add(d_tokvecs, ids, d_state_features)
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
+        return (backprop_tok2vec(d_tokvecs[:-1]), None)
+
+    return (list(batch), all_scores), backprop_parser
+
+
+def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
+    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = True
+    return mask
+
+
+def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
+    W: Floats2d = model.get_param("hidden_W")
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
+    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
+    W3f = W3f.transpose((1, 0, 2))
+    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
+    assert X.shape == (X.shape[0], nI), X.shape
+    Yf_ = model.ops.gemm(X, W2f, trans2=True)
+    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
+
+    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nH, nP), and get back:
+        # (nB, nH, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nH, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        dXf = model.ops.gemm(dY, W)
+        Xf = X[ids].reshape((ids.shape[0], -1))
+        dW = model.ops.gemm(dY, Xf, trans1=True)
+        model.inc_grad("hidden_W", dW)
+        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
+
+    return Yf, backward
+
+
+def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
+    if Y is None:
+        return None
+    _, scores = Y
+    if len(scores) == 0:
+        return None
+    assert scores[0].shape[0] >= 1
+    assert len(scores[0].shape) == 2
+    return scores[0].shape[1]
+
+
+def _lsuv_init(model: Model):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    W = model.maybe_get_param("hidden_W")
+    if W is not None and W.any():
+        return
+
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nH, nP, nI)
+    b = model.ops.alloc2f(nH, nP)
+    pad = model.ops.alloc4f(1, nF, nH, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc_f((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc_f((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("hidden_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("hidden_b", b)
+        else:
+            break
+    return model
+
+
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
+
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = <const float*>hidden_b.data
+    weights.hidden_weights = <const float *> output_W.data
+    weights.hidden_bias = <const float *> output_b.data
+    weights.seen_mask = <const int8_t*> seen_mask.data
+
+    return weights
+
+
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
+
+
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    _resize_activations(&A, n)
+    return A
+
+
+cdef void _free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    if W.hidden_weights == NULL:
+        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+                      1.0, <const float *>A.hiddens, n.hiddens,
+                      <const float *>W.hidden_weights, n.hiddens,
+                      0.0, scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = scores[0]
+    for i in range(1, n.states * n.classes):
+        if scores[i] < min_:
+            min_ = scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if W.seen_mask[j]:
+                scores[i*n.classes+j] = min_
+
+
+cdef void _sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    cdef int B = n.states
+    cdef int O = n.hiddens * n.pieces
+    cdef int F = n.feats
+    cdef int T = n.tokens
+    padding = cached + (T * F * O)
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 84e30bab396..327423924a1 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -9,6 +9,7 @@ from ...typedefs cimport class_t
 from .transition_system cimport Transition, TransitionSystem
 
 from ...errors import Errors
+from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
@@ -28,7 +29,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(object):
+cdef class BeamBatch(Batch):
     cdef public TransitionSystem moves
     cdef public object states
     cdef public object docs
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
new file mode 100644
index 00000000000..7fee05bad60
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd
@@ -0,0 +1,2 @@
+cdef int arg_max(const float* scores, const int n_classes) nogil
+cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
new file mode 100644
index 00000000000..582756bf5be
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx
@@ -0,0 +1,22 @@
+# cython: infer_types=True
+
+cdef inline int arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index c063cf97cd4..1b6b25e5f16 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -8,6 +8,7 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -27,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -35,31 +36,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -132,19 +136,20 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
             return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
     int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
         else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
             if b_i >= this.length:
                 return -1
             else:
@@ -243,7 +248,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
             return 1
         else:
             return 0
@@ -327,7 +332,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -347,6 +352,9 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -355,12 +363,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = arcs.back()
+        arc = &arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -400,10 +408,11 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 11e3f483a79..66d6fb9acf5 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -778,6 +778,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -862,6 +864,7 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
new file mode 100644
index 00000000000..60734e549aa
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pxd
@@ -0,0 +1,2 @@
+cdef class Batch:
+    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
new file mode 100644
index 00000000000..91073b52e68
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pyx
@@ -0,0 +1,52 @@
+from typing import Any
+
+TransitionSystem = Any  # TODO
+
+cdef class Batch:
+    def advance(self, scores):
+        raise NotImplementedError
+
+    def get_states(self):
+        raise NotImplementedError
+
+    @property
+    def is_done(self):
+        raise NotImplementedError
+
+    def get_unfinished_states(self):
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+
+class GreedyBatch(Batch):
+    def __init__(self, moves: TransitionSystem, states, golds):
+        self._moves = moves
+        self._states = states
+        self._next_states = [s for s in states if not s.is_final()]
+
+    def advance(self, scores):
+        self._next_states = self._moves.transition_states(self._next_states, scores)
+
+    def advance_with_actions(self, actions):
+        self._next_states = self._moves.apply_actions(self._next_states, actions)
+
+    def get_states(self):
+        return self._states
+
+    @property
+    def is_done(self):
+        return all(s.is_final() for s in self._states)
+
+    def get_unfinished_states(self):
+        return [st for st in self._states if not st.is_final()]
+
+    def __getitem__(self, i):
+        return self._states[i]
+
+    def __len__(self):
+        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 324a497c9fb..a8c72f238f6 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -316,6 +316,8 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index f25408a13ba..c2a0d22956a 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,6 +21,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -177,3 +181,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 04cd10d8864..66cc7747b69 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,3 +57,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 485ce7c10bd..7bd39ba43c5 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,6 +3,8 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -10,6 +12,7 @@ import srsly
 
 from ...structs cimport TokenC
 from .stateclass cimport StateClass
+from ._parser_utils cimport arg_max_if_valid
 
 from ... import util
 from ...errors import Errors
@@ -73,7 +76,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -85,6 +99,8 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -110,6 +126,7 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -137,6 +154,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -249,3 +288,35 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil:
+        cdef int i
+        cdef Transition action
+        cdef StateC* state
+        for i in range(batch_size):
+            state = states[i]
+            action = moves.c[actions[i]]
+            action.do(state, action.label)
+            state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+    int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)
+
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py
similarity index 97%
rename from spacy/pipeline/dep_parser.pyx
rename to spacy/pipeline/dep_parser.py
index cbd7187ff0f..c996074d2c4 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.py
@@ -5,6 +5,8 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from ._parser_internals.arc_eager cimport ArcEager
 from .transition_parser cimport Parser
@@ -22,12 +24,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -233,6 +234,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -240,8 +242,11 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -254,11 +259,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -278,8 +284,7 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py
similarity index 93%
rename from spacy/pipeline/ner.pyx
rename to spacy/pipeline/ner.py
index fe54d33a17b..41280c49390 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.py
@@ -10,6 +10,13 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
+from ..util import registry
+from ..training import remove_bilu_prefix
 
 from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
@@ -21,12 +28,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -51,8 +57,12 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -119,7 +129,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -193,11 +208,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -215,15 +231,14 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,   # not relevant for NER
+            min_action_freq=1,  # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
deleted file mode 100644
index 7adb82213de..00000000000
--- a/spacy/pipeline/transition_parser.pxd
+++ /dev/null
@@ -1,21 +0,0 @@
-from cymem.cymem cimport Pool
-from thinc.backends.cblas cimport CBlas
-
-from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
-from ..vocab cimport Vocab
-from ._parser_internals._state cimport StateC
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from .trainable_pipe cimport TrainablePipe
-
-
-cdef class Parser(TrainablePipe):
-    cdef public object _rehearsal_model
-    cdef readonly TransitionSystem moves
-    cdef public object _multitasks
-    cdef object _cpu_ops
-
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-                      WeightsC weights, SizesC sizes) nogil
-
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-                                 int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index e0449f188b5..4653fd7e600 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -13,41 +13,29 @@ from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
+import contextlib
 
 import srsly
 from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
+from thinc.api import chain, softmax_activation, use_ops, get_array_module
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 import numpy.random
 import numpy
 import numpy.random
 import srsly
-from thinc.api import (
-    CupyOps,
-    NumpyOps,
-    Optimizer,
-    SequenceCategoricalCrossentropy,
-    chain,
-    get_ops,
-    set_dropout_rate,
-    softmax_activation,
-    use_ops,
-)
-from thinc.types import Floats2d
+from thinc.api import CupyOps, NumpyOps, set_dropout_rate
 
-from ._parser_internals.stateclass cimport StateClass
+from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
-from ._parser_internals.stateclass cimport StateClass
-
-from .trainable_pipe import TrainablePipe
-
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
+from ..vocab cimport Vocab
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ..typedefs cimport weight_t
 
 from ..training import validate_examples, validate_get_examples
 from ..training import validate_distillation_examples
@@ -81,7 +69,7 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 NUMPY_OPS = NumpyOps()
 
 
-cdef class Parser(TrainablePipe):
+class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -181,8 +169,9 @@ cdef class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
+        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -288,15 +277,6 @@ cdef class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
-        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
-        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
-
-        # Add softmax activation, so that we can compute student losses
-        # with cross-entropy loss.
-        with use_ops("numpy"):
-            teacher_model = chain(teacher_step_model, softmax_activation())
-            student_model = chain(student_step_model, softmax_activation())
-
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -304,50 +284,38 @@ cdef class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_step_model, student_docs, max_moves)
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        loss = 0.0
-        n_moves = 0
-        while states:
-            # We do distillation as follows: (1) for every state, we compute the
-            # transition softmax distributions: (2) we backpropagate the error of
-            # the student (compared to the teacher) into the student model; (3)
-            # for all states, we move to the next state using the student's
-            # predictions.
-            teacher_scores = teacher_model.predict(states)
-            student_scores, backprop = student_model.begin_update(states)
-            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-            backprop(d_scores)
-            loss += state_loss
-            self.transition_states(states, student_scores)
-            states = [state for state in states if not state.is_final()]
-
-            # Stop when we reach the maximum number of moves, otherwise we start
-            # to process the remainder of cut sequences again.
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+        # We distill as follows: 1. we first let the student predict transition
+        # sequences (and the corresponding transition probabilities); (2) we
+        # let the teacher follow the student's predicted transition sequences
+        # to obtain the teacher's transition probabilities; (3) we compute the
+        # gradients of the student's transition distributions relative to the
+        # teacher's distributions.
+
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
+            moves=self.moves, actions=actions)
+        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
-        backprop_tok2vec(student_docs)
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        backprop_scores((student_states, d_scores))
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
-        del backprop
-        del backprop_tok2vec
-        teacher_step_model.clear_memory()
-        student_step_model.clear_memory()
-        del teacher_model
-        del student_model
-
         return losses
 
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+        normalize: bool=False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -359,10 +327,28 @@ cdef class Parser(TrainablePipe):
 
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
+
+        # We can't easily hook up a softmax layer in the parsing model, since
+        # the get_loss does additional masking. So, we could apply softmax
+        # manually here and use Thinc's cross-entropy loss. But it's a bit
+        # suboptimal, since we can have a lot of states that would result in
+        # many kernel launches. Futhermore the parsing model's backprop expects
+        # a XP array, so we'd have to concat the softmaxes anyway. So, like
+        # the get_loss implementation, we'll compute the loss and gradients
+        # ourselves.
+
+        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
+            axis=-1, inplace=True)
+        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
+            axis=-1, inplace=True)
+
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = student_scores - teacher_scores
+        if normalize:
+            d_scores /= d_scores.shape[0]
+        loss = (d_scores**2).sum() / d_scores.size
+
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -385,9 +371,6 @@ cdef class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
-        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
-            deals with a failing batch of documents. The default function just reraises
-            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -408,76 +391,29 @@ cdef class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
+        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        if self.cfg["beam_width"] == 1:
-            return self.greedy_parse(docs, drop=0.0)
-        else:
-            return self.beam_parse(
-                docs,
-                drop=0.0,
-                beam_width=self.cfg["beam_width"],
-                beam_density=self.cfg["beam_density"]
-            )
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states_or_beams, _ = self.model.predict(inputs)
+        return states_or_beams
 
     def greedy_parse(self, docs, drop=0.):
-        cdef vector[StateC*] states
-        cdef StateClass state
-        cdef CBlas cblas = self._cpu_ops.cblas()
+        self._resize()
         self._ensure_labels_are_added(docs)
-        set_dropout_rate(self.model, drop)
-        batch = self.moves.init_batch(docs)
-        model = self.model.predict(docs)
-        weights = get_c_weights(model)
-        for state in batch:
-            if not state.is_final():
-                states.push_back(state.c)
-        sizes = get_c_sizes(model, states.size())
-        with nogil:
-            self._parseC(cblas, &states[0], weights, sizes)
-        model.clear_memory()
-        del model
-        return batch
+        with _change_attrs(self.model, beam_width=1):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states, _ = self.model.predict(inputs)
+        return states
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
         self._ensure_labels_are_added(docs)
-        batch = _beam_utils.BeamBatch(
-            self.moves,
-            self.moves.init_batch(docs),
-            None,
-            beam_width,
-            density=beam_density
-        )
-        model = self.model.predict(docs)
-        while not batch.is_done:
-            states = batch.get_unfinished_states()
-            if not states:
-                break
-            scores = model.predict(states)
-            batch.advance(scores)
-        model.clear_memory()
-        del model
-        return list(batch)
-
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-                      WeightsC weights, SizesC sizes) nogil:
-        cdef int i
-        cdef vector[StateC*] unfinished
-        cdef ActivationsC activations = alloc_activations(sizes)
-        while sizes.states >= 1:
-            predict_states(cblas, &activations, states, &weights, sizes)
-            # Validate actions, argmax, take action.
-            self.c_transition_batch(states, activations.scores,
-                                    sizes.classes, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-            sizes.states = unfinished.size()
-            unfinished.clear()
-        free_activations(&activations)
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            beams, _ = self.model.predict(inputs)
+        return beams
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -488,35 +424,6 @@ cdef class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
-    def transition_states(self, states, float[:, ::1] scores):
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-                                 int nr_class, int batch_size) nogil:
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        with gil:
-            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
-        cdef int i, guess
-        cdef Transition action
-        for i in range(batch_size):
-            self.moves.set_valid(is_valid, states[i])
-            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-            if guess == -1:
-                # This shouldn't happen, but it's hard to raise an error here,
-                # and we don't want to infinite loop. So, force to end state.
-                states[i].force_final()
-            else:
-                action = self.moves.c[guess]
-                action.do(states[i], action.label)
-        free(is_valid)
-
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -527,66 +434,99 @@ cdef class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
+        # We need to take care to act on the whole batch, because we might be
+        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        # The probability we use beam update, instead of falling back to
-        # a greedy update
-        beam_update_prob = self.cfg["beam_update_prob"]
-        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
-            return self.update_beam(
-                examples,
-                beam_width=self.cfg["beam_width"],
-                sgd=sgd,
-                losses=losses,
-                beam_density=self.cfg["beam_density"]
-            )
+        docs = [eg.x for eg in examples if len(eg.x)]
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            init_states, gold_states, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-
-        states_golds = list(zip(states, golds))
-        n_moves = 0
-        while states_golds:
-            states, golds = zip(*states_golds)
-            scores, backprop = model.begin_update(states)
-            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            # Note that the gradient isn't normalized by the batch size
-            # here, because our "samples" are really the states...But we
-            # can't normalize by the number of states either, as then we'd
-            # be getting smaller gradients for states in long sequences.
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
+            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        backprop_tok2vec(golds)
+        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
+            max_moves=max_moves, states=[state.copy() for state in init_states])
+        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
+        if sum(s.shape[0] for s in scores) == 0:
+            return losses
+        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
+            examples, max_moves)
+        backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
+        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        del model
+        del backprop_scores
         return losses
 
+    def get_loss(self, states_scores, examples, max_moves):
+        gold_states, init_states, pred_states, scores = states_scores
+        scores = self.model.ops.xp.vstack(scores)
+        costs = self._get_costs_from_histories(
+            examples,
+            gold_states,
+            init_states,
+            [list(state.history) for state in pred_states],
+            max_moves
+        )
+        xp = get_array_module(scores)
+        best_costs = costs.min(axis=1, keepdims=True)
+        gscores = scores.copy()
+        min_score = scores.min() - 1000
+        assert costs.shape == scores.shape, (costs.shape, scores.shape)
+        gscores[costs > best_costs] = min_score
+        max_ = scores.max(axis=1, keepdims=True)
+        gmax = gscores.max(axis=1, keepdims=True)
+        exp_scores = xp.exp(scores - max_)
+        exp_gscores = xp.exp(gscores - gmax)
+        Z = exp_scores.sum(axis=1, keepdims=True)
+        gZ = exp_gscores.sum(axis=1, keepdims=True)
+        d_scores = exp_scores / Z
+        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
+        return d_scores
+
+    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
+        cdef TransitionSystem moves = self.moves
+        cdef StateClass state
+        cdef int clas
+        cdef int nF = self.model.get_dim("nF")
+        cdef int nO = moves.n_moves
+        cdef int nS = sum([len(history) for history in histories])
+        cdef Pool mem = Pool()
+        cdef np.ndarray costs_i
+        is_valid = <int*>mem.alloc(nO, sizeof(int))
+        batch = list(zip(init_states, histories, gold_states))
+        n_moves = 0
+        output = []
+        while batch:
+            costs = numpy.zeros((len(batch), nO), dtype="f")
+            for i, (state, history, gold) in enumerate(batch):
+                costs_i = costs[i]
+                clas = history.pop(0)
+                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
+                action = moves.c[clas]
+                action.do(state.c, action.label)
+                state.c.history.push_back(clas)
+            output.append(costs)
+            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
+            if n_moves >= max_moves >= 1:
+                break
+            n_moves += 1
+
+        return self.model.ops.xp.vstack(output)
+
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -596,10 +536,9 @@ cdef class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.)
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
-        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -607,85 +546,33 @@ cdef class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, backprop_tok2vec = self.model.begin_update(docs)
-        n_scores = 0.
-        loss = 0.
-        while states:
-            targets, _ = tutor.begin_update(states)
-            guesses, backprop = model.begin_update(states)
-            d_scores = (guesses - targets) / targets.shape[0]
-            # If all weights for an output are 0 in the original model, don't
-            # supervise that output. This allows us to add classes.
-            loss += (d_scores**2).sum()
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, guesses)
-            states = [state for state in states if not state.is_final()]
-            n_scores += d_scores.size
-        # Do the backprop
-        backprop_tok2vec(docs)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss / n_scores
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        tutor.clear_memory()
-        del model
-        del tutor
-        return losses
+        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
+        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
+
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
+
+        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
+        student_scores = self.model.ops.xp.vstack(student_scores)
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
+        # If all weights for an output are 0 in the original model, don't
+        # supervise that output. This allows us to add classes.
+        loss = (d_scores**2).sum() / d_scores.size
+        backprop_scores((student_states, d_scores))
 
-    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
-                    losses=None, beam_density=0.0):
-        states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        # Prepare the stepwise model, and get the callback for finishing the batch
-        model, backprop_tok2vec = self.model.begin_update(
-            [eg.predicted for eg in examples])
-        loss = _beam_utils.update_beam(
-            self.moves,
-            states,
-            golds,
-            model,
-            beam_width,
-            beam_density=beam_density,
-        )
-        losses[self.name] += loss
-        backprop_tok2vec(golds)
         if sgd is not None:
             self.finish_update(sgd)
+        losses[self.name] += loss
 
-    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
-        cdef StateClass state
-        cdef Pool mem = Pool()
-        cdef int i
-
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                               dtype='f', order='C')
-        c_d_scores = <float*>d_scores.data
-        unseen_classes = self.model.attrs["unseen_classes"]
-        for i, (state, gold) in enumerate(zip(states, golds)):
-            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
-            memset(costs, 0, self.moves.n_moves * sizeof(float))
-            self.moves.set_costs(is_valid, costs, state.c, gold)
-            for j in range(self.moves.n_moves):
-                if costs[j] <= 0.0 and j in unseen_classes:
-                    unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
-                         d_scores.shape[1])
-            c_d_scores += d_scores.shape[1]
-        # Note that we don't normalize this. See comment in update() for why.
-        if losses is not None:
-            losses.setdefault(self.name, 0.)
-            losses[self.name] += (d_scores**2).sum()
-        return d_scores
+        return losses
+
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        raise NotImplementedError
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -724,7 +611,7 @@ cdef class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(doc_sample)
+        self.model.initialize((doc_sample, self.moves))
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -816,26 +703,27 @@ cdef class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long
-        doc will get multiple states. Let's say we have a doc of length 2*N,
-        where N is the shortest doc. We'll make two states, one representing
-        long_doc[:N], and another representing long_doc[N:]."""
+        sequence or a cap. A long doc will get multiple states. Let's say we
+        have a doc of length 2*N, where N is the shortest doc. We'll make
+        two states, one representing long_doc[:N], and another representing
+        long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-        all_states = self.moves.init_batch([eg.predicted for eg in examples])
+            TransitionSystem moves = self.moves
+        all_states = moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if self.moves.has_gold(eg) and not state.is_final():
-                gold = self.moves.init_gold(state, eg)
+            if moves.has_gold(eg) and not state.is_final():
+                gold = moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                    oracle_actions = moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -845,13 +733,52 @@ cdef class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = self.moves.c[clas]
+                    action = moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
+
+
+@contextlib.contextmanager
+def _change_attrs(model, **kwargs):
+    """Temporarily modify a thinc model's attributes."""
+    unset = object()
+    old_attrs = {}
+    for key, value in kwargs.items():
+        old_attrs[key] = model.attrs.get(key, unset)
+        model.attrs[key] = value
+    yield model
+    for key, value in old_attrs.items():
+        if value is unset:
+            model.attrs.pop(key)
+        else:
+            model.attrs[key] = value
+
+
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
+    cdef int step
+    cdef StateClass state
+    cdef StateC* c_state
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for state in states:
+            c_state = state.c
+            if step < c_state.history.size():
+                step_actions.append(c_state.history[step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..4cdaba2fe32 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,8 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +415,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +542,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,7 +578,6 @@ def test_overfitting_IO(use_upper):
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
-        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 42cf5ced998..71acb51d455 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,16 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.pipeline import DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
@@ -59,6 +60,8 @@
     ),
 ]
 
+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1
 
 
@@ -171,6 +174,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -319,7 +373,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -345,11 +399,15 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
+    fix_random_seed(0)
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -451,10 +509,12 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index c3c4bb6c686..f6cefbc1f84 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v2"
+    @architectures = "spacy.TransitionBasedParser.v3"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 8a1c74ca9ed..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,33 +189,11 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -246,7 +224,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser
 
@@ -360,15 +337,16 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        assert model.get_ref("upper").get_dim("nI") == 65
-        assert model.get_ref("lower").get_dim("nI") == 65
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -381,11 +359,13 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        if model.attrs["has_upper"]:
-            assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
@@ -581,9 +561,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index d2a41ff0fed..160e2ecf335 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,24 +1,13 @@
 import ctypes
 import os
 from pathlib import Path
-
-import pytest
-
-try:
-    from pydantic.v1 import ValidationError
-except ImportError:
-    from pydantic import ValidationError  # type: ignore
-
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -101,34 +90,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index d121c9aa56f..3b3df2115e8 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,5 +1,4 @@
 from collections.abc import Iterable as IterableInstance
-
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..db8f974ea19 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,18 +833,17 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -874,23 +873,22 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                       |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
-</Accordion>
+ </Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 4a11efbaa0f..e5bee209281 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v2
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v2
+Name       spacy.TransitionBasedParser.v3
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -741,7 +741,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -764,7 +764,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index b44df538766..44c80622437 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 534cf478087..1b0bc9606e9 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 388f4b456bc3a056d6fa0b6267fa5da7b01c5ba2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 18:28:30 +0100
Subject: [PATCH 362/504] Fix batching regression (#12094)

* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handed.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
---
 pyproject.toml             |  2 +-
 requirements.txt           |  2 +-
 setup.cfg                  |  2 +-
 spacy/training/batchers.py | 28 ++++++++++++----------------
 spacy/util.py              |  6 +++---
 5 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 58e4382e829..3891d137867 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 0325dda2ee9..eaa204462df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 2c1a0591724..3497f989480 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev1,<9.1.0
+    thinc>=9.0.0.dev2,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 9557ffb4eca..22d3e42dc13 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,11 +1,12 @@
 import itertools
-from thinc.schedules import Schedule, constant as constant_schedule
+from thinc.schedules import Schedule
 
 from thinc.schedules import Schedule
 
 from ..util import minibatch, registry
 
-Sizing = Union[Sequence[int], int, Schedule[int]]
+SizingSchedule = Union[Iterable[int], int, Schedule]
+Sizing = Union[Iterable[int], int]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 
@@ -110,13 +111,12 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
+        size_ = iter(size)
+    for outer_batch in minibatch(seqs, size=buffer):
         outer_batch = list(outer_batch)
-        target_size = size_(step)
+        target_size = next(size_)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
@@ -147,12 +147,10 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    step = 0
-    target_size = size_(step)
+        size_ = iter(size)
+    target_size = next(size_)
     tol_size = target_size * tolerance
     batch = []
     overflow = []
@@ -177,8 +175,7 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            step += 1
-            target_size = size_(step)
+            target_size = next(size_)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size
@@ -196,8 +193,7 @@ def minibatch_by_words(
             else:
                 if batch:
                     yield batch
-                step += 1
-                target_size = size_(step)
+                target_size = next(size_)
                 tol_size = target_size * tolerance
                 batch = [seq]
                 batch_size = n_words
diff --git a/spacy/util.py b/spacy/util.py
index a76e8f73eeb..20d7cbb5726 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1622,12 +1622,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
         size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From f9ec63eeb3bb9da67c17107e3284ec049a40feb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Jan 2023 09:25:34 +0100
Subject: [PATCH 363/504] Set version to v4.0.0.dev0 (#12126)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index ef80718fee0..eb85e6af388 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev2"
+__version__ = "4.0.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 799ab9da93d60052769fb90a0ce6993c8c1688e6 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:29:46 +0100
Subject: [PATCH 364/504] Format

---
 spacy/pipeline/edit_tree_lemmatizer.py |  2 +-
 spacy/pipeline/entity_linker.py        | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 8637dc077fa..d5366d7d10f 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -173,7 +173,7 @@ def get_teacher_student_loss(
         student_scores: Scores representing the student model's predictions.
 
         RETURNS (Tuple[float, float]): The loss and the gradient.
-        
+
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
         loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 2716d3821e2..5ecf7f6e272 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -477,7 +477,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         docs_ents: List[Ragged] = []
         docs_scores: List[Ragged] = []
         if not docs:
-            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+            return {
+                KNOWLEDGE_BASE_IDS: final_kb_ids,
+                "ents": docs_ents,
+                "scores": docs_scores,
+            }
         if isinstance(docs, Doc):
             docs = [docs]
         for doc in docs:
@@ -584,7 +588,11 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
+        return {
+            KNOWLEDGE_BASE_IDS: final_kb_ids,
+            "ents": docs_ents,
+            "scores": docs_scores,
+        }
 
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:

From 11290c315c8eaf2894cb5f160f8398c39d6d7831 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 08:37:02 +0100
Subject: [PATCH 365/504] CI: Skip tests that require published pipelines

---
 .github/azure-steps.yml | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index c7722391fec..fc83d4994b4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -64,12 +64,17 @@ steps:
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)
 
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -m spacy download ca_core_news_sm
+#      python -m spacy download ca_core_news_md
+#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#    displayName: 'Test download CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -93,17 +98,17 @@ steps:
     displayName: 'Test train CLI'
     condition: eq(variables['python_version'], '3.8')
 
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#    displayName: 'Test assemble CLI'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#    displayName: 'Test assemble CLI vectors warning'
+#    condition: eq(variables['python_version'], '3.8')
 
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json

From 9c9ac99b013f19bb67c50e6d20b4df194bbec05c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:48:20 +0100
Subject: [PATCH 366/504] Drop python 3.6/3.7, remove unneeded compat (#12187)

* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
---
 azure-pipelines.yml             | 20 +----------
 requirements.txt                |  4 +--
 setup.cfg                       |  4 +--
 spacy/cli/_util.py              | 10 ++++++
 spacy/cli/debug_data.py         |  8 +++++
 spacy/errors.py                 |  3 +-
 spacy/language.py               | 59 +++++++++++++++------------------
 spacy/matcher/matcher.pyi       | 17 ++--------
 spacy/matcher/phrasematcher.pyi |  6 ++--
 spacy/ml/models/parser.py       |  5 +--
 spacy/pipeline/spancat.py       |  2 +-
 spacy/schemas.py                |  9 +++++
 spacy/ty.py                     | 16 ++-------
 spacy/util.py                   |  2 +-
 14 files changed, 72 insertions(+), 93 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 0f7ea91f96f..99f1b8afffe 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
     steps:
       - task: UsePythonVersion@0
         inputs:
-          versionSpec: "3.7"
+          versionSpec: "3.8"
       - script: |
           pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
@@ -40,24 +40,6 @@ jobs:
     strategy:
       matrix:
         # We're only running one platform per Python version to speed up builds
-        Python36Linux:
-          imageName: "ubuntu-20.04"
-          python.version: "3.6"
-        #        Python36Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.6"
-        #        Python36Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.6"
-        #        Python37Linux:
-        #          imageName: "ubuntu-20.04"
-        #          python.version: "3.7"
-        Python37Windows:
-          imageName: "windows-latest"
-          python.version: "3.7"
-        #        Python37Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.7"
         #        Python38Linux:
         #          imageName: "ubuntu-latest"
         #          python.version: "3.8"
diff --git a/requirements.txt b/requirements.txt
index eaa204462df..a68c159d643 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev1,<4.1.0
+spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=0.990,<0.1000; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/setup.cfg b/setup.cfg
index 3497f989480..3a84f37d3bf 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,10 +29,10 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=4.0.0.dev1,<4.1.0
+    spacy-legacy>=4.0.0.dev0,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index eed61119070..977912443bd 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,3 +1,10 @@
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
+from typing import TYPE_CHECKING, overload
+import sys
+import shutil
+from pathlib import Path
+from wasabi import msg, Printer
+import srsly
 import hashlib
 import os
 import shutil
@@ -27,6 +34,9 @@
 from typer.main import get_command
 from wasabi import Printer, msg
 
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 7a98e6d563c..60f760ccb52 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,3 +1,11 @@
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import Literal, cast, overload
+from pathlib import Path
+from collections import Counter
+import sys
+import srsly
+from wasabi import Printer, MESSAGES, msg
+import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/errors.py b/spacy/errors.py
index 245c89aa582..dd64465cd91 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,8 +1,7 @@
+from typing import Literal
 import warnings
 from typing import Literal
 
-from . import about
-
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
diff --git a/spacy/language.py b/spacy/language.py
index 72d27c598cc..1c80f33306c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,3 +1,10 @@
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
+from typing import Union, Tuple, List, Set, Pattern, Sequence
+from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
+
+from dataclasses import dataclass
+import random
+import itertools
 import functools
 import itertools
 import multiprocessing as mp
@@ -30,41 +37,29 @@
     overload,
 )
 
-import srsly
-from thinc.api import Config, CupyOps, Optimizer, get_current_ops
-
-from . import about, ty, util
+from . import ty
+from .tokens.underscore import Underscore
+from .vocab import Vocab, create_vocab
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
+from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
+from .scorer import Scorer
+from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
+from .util import warn_if_jupyter_cupy
+from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
+from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.punctuation import TOKENIZER_INFIXES
+from .tokens import Doc
+from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from . import util
+from . import about
 from .lookups import load_lookups
-from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
-from .schemas import (
-    ConfigSchema,
-    ConfigSchemaInit,
-    ConfigSchemaNlp,
-    ConfigSchemaPretrain,
-    validate_init_settings,
-)
-from .scorer import Scorer
-from .tokenizer import Tokenizer
-from .tokens import Doc
-from .tokens.underscore import Underscore
-from .training import Example, validate_distillation_examples, validate_examples
-from .training.initialize import init_tok2vec, init_vocab
-from .util import (
-    _DEFAULT_EMPTY_PIPES,
-    CONFIG_SECTION_ORDER,
-    SimpleFrozenDict,
-    SimpleFrozenList,
-    _pipe,
-    combine_score_weights,
-    raise_error,
-    registry,
-    warn_if_jupyter_cupy,
-)
-from .vocab import Vocab, create_vocab
+
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index fe2d8bec3bc..a0b6d91e7d5 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,17 +1,6 @@
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Literal,
-    Optional,
-    Tuple,
-    Union,
-    overload,
-)
-
+from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
+from typing import Iterator, Iterable, overload
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index d3c679a65d5..45685db228a 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,5 +1,7 @@
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
-
+from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
+from typing import overload
+from .matcher import Matcher
+from ..vocab import Vocab
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 59483839206..01312983d86 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,9 @@
-from typing import Optional, List, Tuple, Any
+from typing import Optional, List, Tuple, Any, Literal
 from thinc.types import Floats2d
 from thinc.api import Model
 import warnings
 
 from ...errors import Errors, Warnings
-from ...compat import Literal
-from ...errors import Errors
-from ...tokens import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
 from ...tokens.doc import Doc
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index d800a4d484b..c2e4d29d065 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,5 +1,5 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union
+from typing import Union, Protocol, runtime_checkable
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 4372e3f5e2e..49bd4fe93e2 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,3 +1,12 @@
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
+from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError, Model
+from thinc.config import Promise
+from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/ty.py b/spacy/ty.py
index e4f34a5f651..ac09cb336ac 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,17 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Protocol,
-    Sequence,
-    runtime_checkable,
-)
-
-from thinc.api import Model, Optimizer
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index 20d7cbb5726..ae1135234e3 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -65,7 +65,7 @@
 
 
 from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .compat import cupy, CudaStream, is_windows
 from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows

From 8616a91829e0411ff11873e1447c3dc903d19dcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Jan 2023 12:44:11 +0100
Subject: [PATCH 367/504] Add `Language.distill` (#12116)

* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify that how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py                       | 30 +++++++------------------
 spacy/pipeline/trainable_pipe.pyx       |  4 ++--
 spacy/pipeline/transition_parser.pyx    |  4 ++--
 spacy/tests/test_language.py            |  6 +++++
 website/docs/api/dependencyparser.mdx   | 18 +++++++--------
 website/docs/api/edittreelemmatizer.mdx | 18 +++++++--------
 website/docs/api/entityrecognizer.mdx   | 18 +++++++--------
 website/docs/api/language.mdx           | 28 +++++++++++++++++++++++
 website/docs/api/morphologizer.mdx      | 18 +++++++--------
 website/docs/api/pipe.mdx               | 18 +++++++--------
 website/docs/api/sentencerecognizer.mdx | 18 +++++++--------
 website/docs/api/tagger.mdx             | 18 +++++++--------
 12 files changed, 109 insertions(+), 89 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 1c80f33306c..f44631c90a9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -41,7 +41,7 @@
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
+from .training import Example, validate_examples, validate_distillation_examples
 from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
@@ -1049,7 +1049,7 @@ def distill(
         examples: Iterable[Example],
         *,
         drop: float = 0.0,
-        sgd: Union[Optimizer, None, Literal[False]] = None,
+        sgd: Optional[Optimizer] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1062,9 +1062,7 @@ def distill(
             (teacher) and predicted (student) docs must have the same number of
             tokens and the same orthography.
         drop (float): The dropout rate.
-        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
-            be created via create_optimizer if 'None'. No optimizer will
-            be used when set to 'False'.
+        sgd (Optional[Optimizer]): An optimizer.
         losses (Optional(Dict[str, float])): Dictionary to update with the loss,
             keyed by component.
         component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
@@ -1087,7 +1085,7 @@ def distill(
             return losses
 
         validate_distillation_examples(examples, "Language.distill")
-        examples = _copy_examples(examples, copy_x=True, copy_y=True)
+        examples = _copy_examples(examples)
 
         if sgd is None:
             if self._optimizer is None:
@@ -1134,23 +1132,11 @@ def distill(
                 student_proc.distill(
                     teacher_pipe,
                     examples,
-                    sgd=None,
+                    sgd=sgd,
                     losses=losses,
                     **component_cfg[student_name],
                 )
 
-        # Only finish the update after all component updates are done. Some
-        # components may share weights (such as tok2vec) and we only want
-        # to apply weight updates after all gradients are accumulated.
-        for student_name, student_proc in self.pipeline:
-            if (
-                student_name not in exclude
-                and isinstance(student_proc, ty.DistillableComponent)
-                and student_proc.is_distillable
-                and sgd not in (None, False)
-            ):
-                student_proc.finish_update(sgd)
-
         return losses
 
     def disable_pipes(self, *names) -> "DisabledPipes":
@@ -1919,7 +1905,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
-        orig_distill = config.pop("distillation", None)
+        orig_distill = config.pop("distill", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1929,8 +1915,8 @@ def from_config(
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         if orig_distill is not None:
-            filled["distillation"] = orig_distill
-            config["distillation"] = orig_distill
+            filled["distill"] = orig_distill
+            config["distill"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index ff56357807e..546a1c48abb 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -74,8 +74,8 @@ cdef class TrainablePipe(Pipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 4653fd7e600..815f6cc4400 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -254,8 +254,8 @@ class Parser(TrainablePipe):
         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
             from.
         examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
-            same orthography.
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
             create_optimizer if not set.
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 25352d2bb16..8abb3f88b38 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -85,6 +85,12 @@
 ]
 
 
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx
index 5179ce48b84..296d6d87da5 100644
--- a/website/docs/api/dependencyparser.mdx
+++ b/website/docs/api/dependencyparser.mdx
@@ -154,15 +154,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## DependencyParser.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 2e099365758..c8b5c71806b 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -138,15 +138,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EditTreeLemmatizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx
index 005d5d11deb..f503cc998b0 100644
--- a/website/docs/api/entityrecognizer.mdx
+++ b/website/docs/api/entityrecognizer.mdx
@@ -150,15 +150,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## EntityRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index d65ea376431..f3fad41314e 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -361,6 +361,34 @@ Distill the models in a student pipeline from a teacher pipeline.
 | `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                                     |
 | **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
+## Language.distill {id="distill",tag="method,experimental",version="4"}
+
+Distill the models in a student pipeline from a teacher pipeline.
+
+> #### Example
+>
+> ```python
+>
+> teacher = spacy.load("en_core_web_lg")
+> student = English()
+> student.add_pipe("tagger")
+> student.distill(teacher, examples, sgd=optimizer)
+> ```
+
+| Name                 | Description                                                                                                                                                                                 |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_       |                                                                                                                                                                                             |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4f79458d319..4660ec312fa 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -144,15 +144,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index 120c8f6908f..e1e7f5d7021 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -257,15 +257,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 02fd57102e2..dfb7ed308ba 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -129,15 +129,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## SentenceRecognizer.pipe {id="pipe",tag="method"}
 
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index 664fd7940c1..35e7a23b174 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -128,15 +128,15 @@ This feature is experimental.
 > losses = student.distill(teacher_pipe, examples, sgd=optimizer)
 > ```
 
-| Name           | Description                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                 |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       |
+| Name           | Description                                                                                                                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
+| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                             |
+| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
 ## Tagger.pipe {id="pipe",tag="method"}
 

From 307430ff7c12b8ca7bf690c2dee3b25fa9aad5e5 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 31 Jan 2023 19:31:17 +0900
Subject: [PATCH 368/504] Don't re-download installed models (#12188)

* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 .github/azure-steps.yml | 5 +++++
 spacy/cli/download.py   | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index fc83d4994b4..11dc7e295e4 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -74,6 +74,11 @@ steps:
 #  - script: |
 #      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
 #    displayName: 'Test no warnings on load (#11713)'
+#    condition: eq(variables['python_version'], '3.8')
+#
+#  - script: |
+#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
+#    displayName: 'Test skip re-download (#12188)'
 #    condition: eq(variables['python_version'], '3.8')
 
   - script: |
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0b8ed54ed3c..ea7a06c5417 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,8 @@
 
 from .. import about
 from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
+from ..util import get_package_version
 
 
 @app.command(

From 7b5d5081d75e8f805a570eafc1330046bb5d86cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:06:02 +0100
Subject: [PATCH 369/504] Add the configuration schema for distillation
 (#12201)

* Add the configuration schema for distillation

This also adds the default configuration and some tests. The schema will
be used by the training loop and `distill` subcommand.

* Format

* Change distillation shortopt to -d

* Fix descripion of max_epochs

* Rename distillation flag to -dt

* Rename `pipe_map` to `student_to_teacher`
---
 spacy/schemas.py                              | 24 ++++++++++++++++-
 .../tests/serialize/test_serialize_config.py  | 26 +++++--------------
 2 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 49bd4fe93e2..7418ede3226 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -477,6 +477,27 @@ class Config:
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaDistillEmpty(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
+class ConfigSchemaDistill(BaseModel):
+    # fmt: off
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
+    dropout: StrictFloat = Field(..., title="Dropout rate")
+    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
+    max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
+    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
@@ -485,6 +506,7 @@ class ConfigSchema(BaseModel):
     corpora: Dict[str, Reader]
     initialize: ConfigSchemaInit
     distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]
+    distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {}  # type: ignore[assignment]
 
     class Config:
         extra = "allow"
@@ -496,7 +518,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
-    "distillation": ConfigSchemaDistill,
+    "distill": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..eb0dcc1e38c 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,26 +5,14 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import (
-    DEFAULT_CONFIG,
-    DEFAULT_CONFIG_DISTILL_PATH,
-    DEFAULT_CONFIG_PRETRAIN_PATH,
-    Language,
-)
-from spacy.ml.models import (
-    MaxoutWindowEncoder,
-    MultiHashEmbed,
-    build_tb_parser_model,
-    build_Tok2Vec_model,
-)
+from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
+from spacy.language import Language
+from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
+from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
-from spacy.training import Example
-from spacy.util import (
-    load_config,
-    load_config_from_str,
-    load_model_from_config,
-    registry,
-)
+from spacy.util import load_config, load_config_from_str
+from spacy.util import load_model_from_config, registry
 
 from ..util import make_tempdir
 

From 051cfe3df5d8d57f44221eedfb2fc55181dd17e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 31 Jan 2023 13:19:42 +0100
Subject: [PATCH 370/504] Language.distill: copy both reference and predicted
 (#12209)

* Language.distill: copy both reference and predicted

In distillation we also modify the teacher docs (e.g. in tok2vec
components), so we need to copy both the reference and predicted doc.

Problem caught by @shadeMe

* Make new `_copy_examples` args kwonly
---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index f44631c90a9..e9433633b89 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1085,7 +1085,7 @@ def distill(
             return losses
 
         validate_distillation_examples(examples, "Language.distill")
-        examples = _copy_examples(examples)
+        examples = _copy_examples(examples, copy_x=True, copy_y=True)
 
         if sgd is None:
             if self._optimizer is None:

From 148574e73ed2f522c8c1a88ac96a68e6c59fa8b5 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:30:43 +0100
Subject: [PATCH 371/504] Rename language codes (Icelandic, multi-language)
 (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/_util.py                        |  1 +
 spacy/cli/convert.py                      |  2 ++
 spacy/cli/init_config.py                  | 13 +++----------
 spacy/cli/init_pipeline.py                | 13 ++-----------
 spacy/errors.py                           |  1 +
 spacy/training/converters/json_to_docs.py | 12 ++++--------
 6 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 977912443bd..644f3e5ef24 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -37,6 +37,7 @@
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a282e59c749..19591a05c94 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,6 +8,8 @@
 import srsly
 from wasabi import Printer
 
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index ca0c316ca20..b29a2b748f2 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,16 +12,9 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import (
-    COMMAND,
-    Arg,
-    Opt,
-    _handle_renamed_language_codes,
-    import_code,
-    init_cli,
-    show_validation_error,
-    string_to_list,
-)
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import string_to_list, import_code, _handle_renamed_language_codes
+
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 991dc1a822c..1a044dedbc9 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,17 +8,8 @@
 
 from .. import util
 from ..language import Language
-from ..training.initialize import convert_vectors, init_nlp
-from ._util import (
-    Arg,
-    Opt,
-    _handle_renamed_language_codes,
-    import_code,
-    init_cli,
-    parse_config_overrides,
-    setup_gpu,
-    show_validation_error,
-)
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes
 
 
 @init_cli.command("vectors")
diff --git a/spacy/errors.py b/spacy/errors.py
index dd64465cd91..04edfc7fe38 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -985,6 +985,7 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
 
+RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
 
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index a78c39aea7b..1ff7a64e09d 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,13 +1,9 @@
 import srsly
-
-from ...lang.mul import MultiLanguage
-from ...util import load_model
-from ..example import (
-    _fix_legacy_dict_data,
-    _parse_example_dict_data,
-    annotations_to_doc,
-)
 from ..gold_io import json_iterate, json_to_annotations
+from ..example import annotations_to_doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.mul import MultiLanguage
 
 
 def json_to_docs(input_data, model=None, **kwargs):

From 509572e494fa23e1466084b98650de3adbbc7989 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 1 Feb 2023 17:47:56 +0900
Subject: [PATCH 372/504] Move Entity Linker v1 to spacy-legacy (#12006)

* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
---
 spacy/pipeline/entity_linker.py            | 34 ++++++++++++++++++++--
 spacy/tests/pipeline/test_entity_linker.py |  3 +-
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 5ecf7f6e272..bd7040279b3 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,8 +16,18 @@
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 
+from ..kb import KnowledgeBase, Candidate
+from ..ml import empty_kb
+from ..tokens import Doc, Span
+from .pipe import deserialize_config
+from .trainable_pipe import TrainablePipe
+from ..language import Language
+from ..vocab import Vocab
+from ..training import Example, validate_examples, validate_get_examples
+from ..errors import Errors
+from ..util import SimpleFrozenList, registry
 from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
 from ..scorer import Scorer
@@ -129,8 +139,26 @@ def make_entity_linker(
     save_activations (bool): save model activations in Doc when annotating.
     """
     if not model.attrs.get("include_span_maker", False):
-        raise ValueError(Errors.E4005)
-
+        try:
+            from spacy_legacy.components.entity_linker import EntityLinker_v1
+        except:
+            raise ImportError(
+                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
+            )
+        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
+        return EntityLinker_v1(
+            nlp.vocab,
+            model,
+            name,
+            labels_discard=labels_discard,
+            n_sents=n_sents,
+            incl_prior=incl_prior,
+            incl_context=incl_context,
+            entity_vector_length=entity_vector_length,
+            get_candidates=get_candidates,
+            overwrite=overwrite,
+            scorer=scorer,
+        )
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index d2d07929a70..33fdaafa4ac 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -12,7 +12,6 @@
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.pipeline import EntityLinker, TrainablePipe
-from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tests.util import make_tempdir
@@ -1089,6 +1088,8 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
+    from spacy_legacy.components.entity_linker import EntityLinker_v1
+
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From 3950247be74e51935f958c46f77a4a41a5946c28 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 2 Feb 2023 22:13:38 +0900
Subject: [PATCH 373/504] Cleanup/remove backwards compat overwrite settings
 (#11888)

* Remove backwards-compatible overwrite from Entity Linker

This also adds a docstring about overwrite, since it wasn't present.

* Fix docstring

* Remove backward compat settings in Morphologizer

This also needed a docstring added.

For this component it's less clear what the right overwrite settings
are.

* Remove backward compat from sentencizer

This was simple

* Remove backward compat from senter

Another simple one

* Remove backward compat setting from tagger

* Add docstrings

* Update spacy/pipeline/morphologizer.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update docs

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/pipeline/entity_linker.py    | 3 ---
 spacy/pipeline/morphologizer.pyx   | 4 ----
 spacy/pipeline/sentencizer.pyx     | 1 -
 spacy/pipeline/senter.pyx          | 3 ++-
 spacy/pipeline/tagger.pyx          | 3 ---
 website/docs/api/entitylinker.mdx  | 2 +-
 website/docs/api/morphologizer.mdx | 2 +-
 7 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index bd7040279b3..ffed68e06d8 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -43,9 +43,6 @@
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
 
-# See #9050
-BACKWARD_OVERWRITE = True
-
 default_model_config = """
 [model]
 @architectures = "spacy.EntityLinker.v2"
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 92c1fed8efb..5edd922019d 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -29,10 +29,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .tagger import ActivationsT, Tagger
 
-# See #9050
-BACKWARD_OVERWRITE = True
-BACKWARD_EXTEND = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 6dd62ed8577..9087bff0cdc 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -11,7 +11,6 @@ from ..scorer import Scorer
 from .pipe import Pipe
 from .senter import senter_score
 
-
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index d8fdf8f739f..e83dd789152 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -18,7 +18,8 @@ from ..language import Language
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import ActivationsT, Tagger
+from .tagger import Tagger
+
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index cc43caa72c8..a73461ee74a 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -32,9 +32,6 @@ from .trainable_pipe import TrainablePipe
 
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
-# See #9050
-BACKWARD_OVERWRITE = False
-
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 238b62a2e6d..12b2f6bef1d 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
 | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
 | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
 | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
 | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 4660ec312fa..9514bc773b9 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |

From 11630759ab0707566b6910278a6756ef2f6097b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 3 Feb 2023 15:22:25 +0100
Subject: [PATCH 374/504] `Language.update`: ensure that tok2vec gets updated
 (#12136)

* `Language.update`: ensure that tok2vec gets updated

The components in a pipeline can be updated independently. However,
tok2vec implementations are an exception to this, since they depend on
listeners for their gradients. The update method of a tok2vec
implementation computes the tok2vec forward and passes this along with a
backprop function to the listeners. This backprop function accumulates
gradients for all the listeners. There are two ways in which the
accumulated gradients can be used to update the tok2vec weights:

1. Call the `finish_update` method of tok2vec *after* the `update`
   method is called on all of the pipes that use a tok2vec listener.
2. Pass an optimizer to the `update` method of tok2vec. In this
   case, tok2vec will give the last listener a special backprop
   function that calls `finish_update` on the tok2vec.

Unfortunately, `Language.update` did neither of these. Instead, it
immediately called `finish_update` on every pipe after `update`. As a
result, the tok2vec weights are updated when no gradients have been
accumulated from listeners yet. And the gradients of the listeners are
only used in the next call to `Language.update` (when `finish_update` is
called on tok2vec again).

This change fixes this issue by passing the optimizer to the `update`
method of trainable pipes, leading to use of the second strategy
outlined above.

The main updating loop in `Language.update` is also simplified by using
the `TrainableComponent` protocol consistently.

* Train loop: `sgd` is `Optional[Optimizer]`, do not pass false

* Language.update: call pipe finish_update after all pipe updates

This does correct and fast updates if multiple components update the
same parameters.

* Add comment why we moved `finish_update` to a separate loop
---
 spacy/language.py            |  1 -
 spacy/tests/test_language.py | 38 ++++++------------------------------
 spacy/training/loop.py       |  2 +-
 3 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e9433633b89..913c2cff6b6 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1299,7 +1299,6 @@ def update(
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
-                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 8abb3f88b38..3a3cdf4a353 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -13,14 +13,12 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.util import (
-    find_matching_language,
-    ignore_error,
-    load_model_from_config,
-    raise_error,
-    registry,
-)
-from spacy.vocab import Vocab
+from spacy.lang.en import English
+from spacy.lang.de import German
+from spacy.util import registry, ignore_error, raise_error, find_matching_language
+from spacy.util import load_model_from_config
+import spacy
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
@@ -85,12 +83,6 @@
 ]
 
 
-TAGGER_TRAIN_DATA = [
-    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
-    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
-]
-
-
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
@@ -170,24 +162,6 @@ def test_language_update_updates():
     )
 
 
-def test_language_update_does_not_update_with_sgd_false():
-    config = Config().from_str(TAGGER_CFG_STRING)
-    nlp = load_model_from_config(config, auto_fill=True, validate=True)
-
-    train_examples = []
-    for t in TAGGER_TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-
-    nlp.initialize(get_examples=lambda: train_examples)
-
-    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
-    nlp.update(train_examples, sgd=False)
-    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
-
-    xp = get_array_module(docs_after_update[0].tensor)
-    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
-
-
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0f8d561b9a6..362cdd1e959 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -469,7 +469,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=False,
+                sgd=None,
                 exclude=exclude,
                 annotates=annotating_components,
             )

From ac5f1be0c9d0008d754eb44e9e72dc697b0fafd7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:28:34 +0100
Subject: [PATCH 375/504] Use the same tuple in Span cmp and hash (#12251)

---
 spacy/tokens/span.pyx | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 1c19cc6d495..9c44639343f 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -136,9 +136,8 @@ cdef class Span:
             else:
                 return True
 
-        cdef SpanC* span_c = self.span_c()
-        cdef SpanC* other_span_c = other.span_c()
-
+        self_tuple = self._cmp_tuple()
+        other_tuple = other._cmp_tuple()
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
@@ -193,8 +192,20 @@ cdef class Span:
             return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
+        return hash(self._cmp_tuple())
+
+    def _cmp_tuple(self):
         cdef SpanC* span_c = self.span_c()
-        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
+        return (
+            span_c.start_char,
+            span_c.end_char,
+            span_c.start,
+            span_c.end,
+            span_c.label,
+            span_c.kb_id,
+            span_c.id,
+            self.doc,
+        )
 
     def __len__(self):
         """Get the number of tokens in the span.

From 252515e255dd0fc17225084745dadb84c17f04d6 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Feb 2023 14:37:42 +0100
Subject: [PATCH 376/504] Remove names for vectors (#12243)

* Remove names for vectors

Named vectors are basically a carry-over from v2 and aren't used for
anything.

* Format
---
 spacy/vectors.pyx | 2 +-
 spacy/vocab.pyx   | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 111a9d01e08..876c56bed1d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -130,7 +130,7 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
         """Create a new vector store.
 
         strings (StringStore): The string store.
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index de543c25d88..f55f4c61959 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -49,8 +49,9 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
-            oov_prob=-20., writing_system=None, get_noun_chunks=None):
+    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
+                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
+                 **deprecated_kwargs):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to

From 4b892dc26fc3a51c1b120bf178d09654d3554d48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 21 Feb 2023 15:47:18 +0100
Subject: [PATCH 377/504] Reimplement distillation with oracle cut size
 (#12214)

* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/ml/tb_framework.pyx        |  4 ++-
 spacy/tests/parser/test_model.py | 61 ++++++++++++++++++++++++++++++++
 spacy/tests/parser/test_ner.py   |  5 ++-
 spacy/tests/parser/test_parse.py |  5 ++-
 4 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00bd..9b2114900d3 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 00000000000..8c1cf7a9346
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 4cdaba2fe32..d9cbf5e8c72 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -625,7 +625,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,6 +645,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 71acb51d455..6f0e6b19841 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -460,7 +460,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -478,6 +480,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 03a1998236574cb8eff1de2c5ba8bfb141a935c4 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 378/504] Drop support for EntityLinker_v1. (#12377)

---
 spacy/errors.py                            |  1 +
 spacy/pipeline/entity_linker.py            | 22 ++--------------------
 spacy/tests/pipeline/test_entity_linker.py |  1 -
 3 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 04edfc7fe38..1e23f956cef 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -984,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index ffed68e06d8..80d35607a67 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -136,26 +136,8 @@ def make_entity_linker(
     save_activations (bool): save model activations in Doc when annotating.
     """
     if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+        raise ValueError(Errors.E4005)
+
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 33fdaafa4ac..e541276d198 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1088,7 +1088,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
 
     # Ensure that the legacy architectures still work
     vector_length = 3

From d441c3c01d8edaebc31ebd73c0f1859514a3f7ca Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 9 Mar 2023 09:37:19 +0100
Subject: [PATCH 379/504] `Tok2Vec`: Add `distill` method (#12108)

* `Tok2Vec`: Add `distill` method

* `Tok2Vec`: Refactor `update`

* Add `Tok2Vec.distill` test

* Update `distill` signature to accept `Example`s instead of separate teacher and student docs

* Add docs

* Remove docstring

* Update test

* Remove `update` calls from test

* Update `Tok2Vec.distill` docstring
---
 spacy/pipeline/tok2vec.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 92aec22b7a7..0be04232caa 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,3 +1,6 @@
+from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
+from thinc.api import Model, set_dropout_rate, Optimizer, Config
+from thinc.types import Floats2d
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
 

From fcb5a69d23b7b25f2ec3bf6115b0e226957bf972 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 19 Mar 2023 23:41:20 +0100
Subject: [PATCH 380/504] Clean up Vocab constructor (#12290)

* Clean up Vocab constructor

* Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]`
  * Don't automatically add strings to vocab
* Change default values to `None`
* Remove `**deprecated_kwargs`

* Format
---
 spacy/strings.pyi | 2 +-
 spacy/vocab.pyx   | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 1da5443fb11..64dceeb7726 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any, Iterable, Iterator, Optional, Union, overload
 
 class StringStore:
-    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
     @overload
     def __getitem__(self, string_or_hash: str) -> int: ...
     def __getitem__(self, string_or_hash: str) -> int: ...
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index f55f4c61959..de543c25d88 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -49,9 +49,8 @@ cdef class Vocab:
 
     DOCS: https://spacy.io/api/vocab
     """
-    def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-                 oov_prob=-20., writing_system={}, get_noun_chunks=None,
-                 **deprecated_kwargs):
+    def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
+            oov_prob=-20., writing_system=None, get_noun_chunks=None):
         """Create the vocabulary.
 
         lex_attr_getters (dict): A dictionary mapping attribute IDs to

From 0c94acad45ea46340ddb0514b55c96dbbfaf5a5d Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 00:34:35 +0100
Subject: [PATCH 381/504] Introduce hierarchy for EL `Candidate` objects
 (#12341)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimick best practices

* fix docs

* fix import

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py                            | 6 ++++++
 spacy/tests/pipeline/test_entity_linker.py | 1 +
 2 files changed, 7 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index 1e23f956cef..776fdd40f7b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -216,7 +216,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -985,6 +989,8 @@ class Errors(metaclass=ErrorsWithCodes):
              "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e541276d198..44f985956bd 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -11,6 +11,7 @@
 from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker, get_candidates
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer

From fca78597d3f204c71547d510f8f424c5de9c5886 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 382/504] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/entity_linker.py            | 37 ++++++++++------------
 spacy/tests/pipeline/test_entity_linker.py |  1 -
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 80d35607a67..eb87d1db987 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -19,6 +19,8 @@
 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -505,26 +507,21 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 continue
             sentences = [s for s in doc.sents]
 
-                if self.incl_context:
-                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    sent_doc = doc[start_token:end_token].as_doc()
-                    # currently, the context is the same for each entity in a sentence (should be refined)
-                    sentence_encoding = self.model.predict([sent_doc])[0]
-                    sentence_encoding_t = sentence_encoding.T
-                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                entity_count += 1
-                if ent.label_ in self.labels_discard:
-                    # ignoring this entity - setting to NIL
-                    final_kb_ids.append(self.NIL)
-                    self._add_activations(
-                        doc_scores=doc_scores,
-                        doc_ents=doc_ents,
-                        scores=[0.0],
-                        ents=[0],
+            # Loop over entities in batches.
+            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
+                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
+
+                # Look up candidate entities.
+                valid_ent_idx = [
+                    idx
+                    for idx in range(len(ent_batch))
+                    if ent_batch[idx].label_ not in self.labels_discard
+                ]
+
+                batch_candidates = list(
+                    self.get_candidates_batch(
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                 else:
                     candidates = list(self.get_candidates(self.kb, ent))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 44f985956bd..a9205810945 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1089,7 +1089,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From aec9e08eee0bbc4c5b8fc6d662cbf7bb095438d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 30 Mar 2023 09:30:42 +0200
Subject: [PATCH 383/504] Avoid `TrainablePipe.finish_update` getting called
 twice during training (#12450)

* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradient were accumulated. However, it introduced a new bug that cause
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed.

This change restores proper handling/use of `False`. Moreover, the role
of `False` is now documented and added to the type to avoid future
accidents.

* Fix typo

* Document defaults for `Language.update`
---
 spacy/language.py            |  1 +
 spacy/tests/test_language.py | 18 ++++++++++++++++++
 spacy/training/loop.py       |  2 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 913c2cff6b6..e9433633b89 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1299,6 +1299,7 @@ def update(
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 3a3cdf4a353..44f39e1d2ba 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -162,6 +162,24 @@ def test_language_update_updates():
     )
 
 
+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 362cdd1e959..0f8d561b9a6 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -469,7 +469,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )

From f69dde32909a5044b9f8c65d2979c90abbdadf1f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 Apr 2023 16:01:59 +0200
Subject: [PATCH 384/504] Enforce that Span.start/end(_char) remain valid and
 in sync (#12268)

* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
---
 spacy/errors.py              |  3 +++
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx        | 49 +++++++++++++++++++++++++++---------
 3 files changed, 87 insertions(+), 12 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 776fdd40f7b..09f6d002d94 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -990,6 +990,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 15ea3614901..8452a5152aa 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -706,3 +706,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9c44639343f..5d52e4fcfc9 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -856,9 +856,13 @@ cdef class Span:
             return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0:
-                raise IndexError("TODO")
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx
 
     property end:
         def __get__(self):
@@ -866,9 +870,16 @@ cdef class Span:
             return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0:
-                raise IndexError("TODO")
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0
 
     property start_char:
         def __get__(self):
@@ -876,9 +887,16 @@ cdef class Span:
             return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError("TODO")
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start
 
     property end_char:
         def __get__(self):
@@ -886,9 +904,16 @@ cdef class Span:
             return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError("TODO")
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end
 
     property label:
         def __get__(self):

From 2968ccf73f91dcd1e01a7d0edb0dc066a1ec27b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 21 Apr 2023 13:49:40 +0200
Subject: [PATCH 385/504] Add distillation loop (#12542)

* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We apply finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/language.py | 26 ++++++++++++++++++++------
 spacy/schemas.py  |  2 +-
 spacy/util.py     |  1 +
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e9433633b89..f8a891ad515 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1049,7 +1049,7 @@ def distill(
         examples: Iterable[Example],
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1062,7 +1062,9 @@ def distill(
             (teacher) and predicted (student) docs must have the same number of
             tokens and the same orthography.
         drop (float): The dropout rate.
-        sgd (Optional[Optimizer]): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Optional(Dict[str, float])): Dictionary to update with the loss,
             keyed by component.
         component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
@@ -1132,11 +1134,23 @@ def distill(
                 student_proc.distill(
                     teacher_pipe,
                     examples,
-                    sgd=sgd,
+                    sgd=None,
                     losses=losses,
                     **component_cfg[student_name],
                 )
 
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for student_name, student_proc in self.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and sgd not in (None, False)
+            ):
+                student_proc.finish_update(sgd)
+
         return losses
 
     def disable_pipes(self, *names) -> "DisabledPipes":
@@ -1905,7 +1919,7 @@ def from_config(
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
-        orig_distill = config.pop("distill", None)
+        orig_distill = config.pop("distillation", None)
         orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1915,8 +1929,8 @@ def from_config(
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
         if orig_distill is not None:
-            filled["distill"] = orig_distill
-            config["distill"] = orig_distill
+            filled["distillation"] = orig_distill
+            config["distillation"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 7418ede3226..cf9d3064065 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -518,7 +518,7 @@ class Config:
     "training": ConfigSchemaTraining,
     "pretraining": ConfigSchemaPretrain,
     "initialize": ConfigSchemaInit,
-    "distill": ConfigSchemaDistill,
+    "distillation": ConfigSchemaDistill,
 }
 
 # Recommendations for init config workflows
diff --git a/spacy/util.py b/spacy/util.py
index ae1135234e3..624fffe865d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,7 @@
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+from thinc.api import fix_random_seed, set_gpu_allocator
 import functools
 import itertools
 import logging

From 755be61e352df087cb68a0a502090eb3e251a567 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:16:03 +0200
Subject: [PATCH 386/504] Remove Python 3.7 builds

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a68c159d643..29420430aab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests

From 639241a612d931b2c108820d1d01073a68c7f9c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:43:05 +0200
Subject: [PATCH 387/504] spancat type fixes

---
 spacy/pipeline/spancat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index c2e4d29d065..f817ad6f94f 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -528,10 +528,9 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
         indices = activations["indices"]
         assert isinstance(indices, Ragged)
         scores = cast(Floats2d, activations["scores"])
-
         offset = 0
         for i, doc in enumerate(docs):
-            indices_i = indices[i].dataXd
+            indices_i = cast(Ints2d, indices[i].dataXd)
             if self.save_activations:
                 doc.activations[self.name] = {}
                 doc.activations[self.name]["indices"] = indices_i

From 9ce16bd9ba3e0ed57f9e97d98275726c42bd03c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 23 Jun 2023 09:43:41 +0200
Subject: [PATCH 388/504] Set version to v4.0.0.dev1 (#12748)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index eb85e6af388..73f201af5fb 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 0f94b12e0c149891324ebe14c6bbb276e3f0efb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 11:41:03 +0200
Subject: [PATCH 389/504] isort all the things

---
 spacy/cli/_util.py                            | 11 ----
 spacy/cli/convert.py                          |  2 -
 spacy/cli/debug_data.py                       |  8 ---
 spacy/cli/download.py                         | 12 +++-
 spacy/cli/init_config.py                      | 13 +++-
 spacy/cli/init_pipeline.py                    | 13 +++-
 spacy/errors.py                               |  1 -
 spacy/language.py                             | 59 ++++++++++---------
 spacy/lexeme.pxd                              | 17 ++++--
 spacy/matcher/matcher.pyi                     | 17 +++++-
 spacy/matcher/matcher.pyx                     |  4 --
 spacy/matcher/phrasematcher.pyi               |  6 +-
 spacy/matcher/phrasematcher.pyx               |  5 +-
 spacy/ml/models/parser.py                     |  9 +--
 spacy/ml/models/tok2vec.py                    |  2 -
 spacy/ml/tb_framework.pyx                     | 37 ++++++++----
 spacy/morphology.pxd                          |  2 +-
 spacy/morphology.pyx                          |  6 +-
 .../_parser_internals/_beam_utils.pxd         |  1 +
 spacy/pipeline/_parser_internals/_state.pxd   |  2 -
 .../pipeline/_parser_internals/arc_eager.pyx  |  1 +
 spacy/pipeline/_parser_internals/search.pxd   |  6 +-
 spacy/pipeline/_parser_internals/search.pyx   |  5 +-
 spacy/pipeline/attribute_ruler.py             |  2 +-
 spacy/pipeline/dep_parser.py                  |  7 ---
 spacy/pipeline/edit_tree_lemmatizer.py        |  8 +--
 spacy/pipeline/ner.py                         | 14 -----
 spacy/pipeline/sentencizer.pyx                |  1 +
 spacy/pipeline/senter.pyx                     | 10 +---
 spacy/pipeline/span_ruler.py                  |  8 ---
 spacy/pipeline/spancat.py                     | 20 +++++--
 spacy/pipeline/textcat.py                     |  4 --
 spacy/pipeline/textcat_multilabel.py          |  4 --
 spacy/pipeline/tok2vec.py                     |  3 -
 spacy/pipeline/transition_parser.pyx          | 52 +++++++++-------
 spacy/schemas.py                              |  9 ---
 spacy/strings.pxd                             |  3 -
 spacy/strings.pyi                             |  3 +-
 spacy/strings.pyx                             |  6 +-
 spacy/tests/doc/test_span.py                  |  1 -
 spacy/tests/doc/test_underscore.py            |  1 +
 spacy/tests/parser/_search.pyx                |  7 ++-
 spacy/tests/parser/test_ner.py                |  2 -
 spacy/tests/parser/test_parse.py              | 10 ++--
 .../pipeline/test_edit_tree_lemmatizer.py     |  2 -
 spacy/tests/pipeline/test_entity_linker.py    |  2 +-
 spacy/tests/pipeline/test_entity_ruler.py     |  6 --
 spacy/tests/pipeline/test_morphologizer.py    |  3 +-
 spacy/tests/pipeline/test_senter.py           |  1 +
 spacy/tests/pipeline/test_spancat.py          |  7 ++-
 spacy/tests/pipeline/test_tagger.py           |  3 +-
 spacy/tests/pipeline/test_textcat.py          | 17 +++---
 .../tests/serialize/test_serialize_config.py  | 25 +++++---
 .../serialize/test_serialize_pipeline.py      | 11 +++-
 spacy/tests/test_language.py                  | 14 +++--
 spacy/tests/test_misc.py                      | 20 ++++---
 spacy/tests/test_symbols.py                   |  1 +
 spacy/tests/training/test_training.py         | 15 +++--
 spacy/tokenizer.pxd                           |  4 --
 spacy/tokenizer.pyx                           |  6 --
 spacy/tokens/__init__.py                      |  3 +-
 spacy/tokens/doc.pyi                          |  5 +-
 spacy/tokens/doc.pyx                          | 15 +----
 spacy/tokens/doc_bin.py                       |  4 +-
 spacy/tokens/morphanalysis.pxd                |  7 ++-
 spacy/tokens/morphanalysis.pyx                |  4 --
 spacy/tokens/retokenizer.pyx                  |  3 +
 spacy/tokens/span.pxd                         |  1 -
 spacy/tokens/token.pyx                        |  1 +
 spacy/training/__init__.py                    |  3 -
 spacy/training/batchers.py                    | 13 ++++
 spacy/training/callbacks.py                   |  6 +-
 spacy/training/converters/json_to_docs.py     | 12 ++--
 spacy/training/example.pyx                    |  1 +
 spacy/ty.py                                   | 16 ++++-
 spacy/util.py                                 | 11 +---
 76 files changed, 341 insertions(+), 315 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 644f3e5ef24..eed61119070 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,10 +1,3 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
 import os
 import shutil
@@ -34,10 +27,6 @@
 from typer.main import get_command
 from wasabi import Printer, msg
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 19591a05c94..a282e59c749 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,8 +8,6 @@
 import srsly
 from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 60f760ccb52..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,11 +1,3 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea7a06c5417..0635522930b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,9 +7,15 @@
 from wasabi import msg
 
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b29a2b748f2..ca0c316ca20 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,9 +12,16 @@
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
-
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 1a044dedbc9..991dc1a822c 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,8 +8,17 @@
 
 from .. import util
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")
diff --git a/spacy/errors.py b/spacy/errors.py
index 09f6d002d94..15ef239ea5b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,4 +1,3 @@
-from typing import Literal
 import warnings
 from typing import Literal
 
diff --git a/spacy/language.py b/spacy/language.py
index f8a891ad515..72d27c598cc 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,10 +1,3 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
 import functools
 import itertools
 import multiprocessing as mp
@@ -37,29 +30,41 @@
     overload,
 )
 
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples, validate_distillation_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
-from . import util
-from . import about
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
 from .lookups import load_lookups
-
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+    ConfigSchema,
+    ConfigSchemaInit,
+    ConfigSchemaNlp,
+    ConfigSchemaPretrain,
+    validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_distillation_examples, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+    _DEFAULT_EMPTY_PIPES,
+    CONFIG_SECTION_ORDER,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    _pipe,
+    combine_score_weights,
+    raise_error,
+    registry,
+    warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
 
 PipeCallable = Callable[[Doc], Doc]
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 2d14edcd6b0..ff51d77e8a9 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,10 +1,19 @@
 from numpy cimport ndarray
 
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
+from .attrs cimport (
+    ID,
+    LANG,
+    LENGTH,
+    LOWER,
+    NORM,
+    ORTH,
+    PREFIX,
+    SHAPE,
+    SUFFIX,
+    attr_id_t,
+)
 from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
 
 
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index a0b6d91e7d5..fe2d8bec3bc 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,6 +1,17 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
-from typing import Iterator, Iterable, overload
-from ..vocab import Vocab
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 73d60767b2f..8accd8c4465 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -32,10 +32,6 @@ from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
 from ..vocab cimport Vocab
 
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings cimport get_string_id
-from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
 from .levenshtein import levenshtein_compare
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 45685db228a..d3c679a65d5 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,7 +1,5 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
-from typing import overload
-from .matcher import Matcher
-from ..vocab import Vocab
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload
+
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index f36c93e8f32..5c79cda6c1e 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,8 +1,9 @@
 # cython: infer_types=True, profile=True
-from typing import List
 from collections import defaultdict
+from typing import List
+
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings
 
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 01312983d86..422abf4e260 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,12 +1,13 @@
-from typing import Optional, List, Tuple, Any, Literal
-from thinc.types import Floats2d
-from thinc.api import Model
 import warnings
+from typing import Any, List, Literal, Optional, Tuple
+
+from thinc.api import Model
+from thinc.types import Floats2d
 
 from ...errors import Errors, Warnings
+from ...tokens.doc import Doc
 from ...util import registry
 from ..tb_framework import TransitionModel
-from ...tokens.doc import Doc
 
 TransitionSystem = Any  # TODO
 State = Any  # TODO
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 9372a665f2c..8e59e34c053 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,8 +22,6 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 9b2114900d3..fd0af12ceab 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,28 +1,45 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import List, Tuple, Any, Optional, TypeVar, cast
-from libc.string cimport memset, memcpy
+from typing import Any, List, Optional, Tuple, TypeVar, cast
+
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
+
 import numpy
+
 cimport numpy as np
-from thinc.api import Model, normal_init, chain, list2array, Linear
-from thinc.api import uniform_init, glorot_uniform_init, zero_init
-from thinc.api import NumpyOps
+
+from thinc.api import (
+    Linear,
+    Model,
+    NumpyOps,
+    chain,
+    glorot_uniform_init,
+    list2array,
+    normal_init,
+    uniform_init,
+    zero_init,
+)
+
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
-from thinc.types import Ints1d, Ints2d
+
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
 from ..pipeline._parser_internals.batch import GreedyBatch
+
 from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
-from ..pipeline._parser_internals.transition_system cimport TransitionSystem
 from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..pipeline._parser_internals.transition_system cimport (
+    TransitionSystem,
+    c_apply_actions,
+    c_transition_batch,
+)
+
 from ..tokens.doc import Doc
 from ..util import registry
 
-
 State = Any  # TODO
 
 
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 7f833e96df1..122ab4f1ab4 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index a7d1c51eab6..a6abf8abec7 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,10 +1,12 @@
 # cython: infer_types
 import warnings
-from typing import Union, Tuple, List, Dict, Optional
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy
+
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
-from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index 571f246b1e3..5a452e56a88 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,5 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
+
 # These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1b6b25e5f16..1c61ac271d8 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,5 +1,4 @@
 cimport libcpp
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -8,7 +7,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 66d6fb9acf5..462aa820e4f 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -18,6 +18,7 @@ from ._state cimport ArcC, StateC
 from .stateclass cimport StateClass
 
 from ...errors import Errors
+
 from .search cimport Beam
 
 
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
index de6a887bed5..54db7154d13 100644
--- a/spacy/pipeline/_parser_internals/search.pxd
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -1,12 +1,10 @@
 from cymem.cymem cimport Pool
-
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.pair cimport pair
 from libcpp.queue cimport priority_queue
 from libcpp.vector cimport vector
 
-from ...typedefs cimport class_t, weight_t, hash_t
+from ...typedefs cimport class_t, hash_t, weight_t
 
 ctypedef pair[weight_t, size_t] Entry
 ctypedef priority_queue[Entry] Queue
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 1d9b6dd7adf..251eaa805cb 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,7 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.string cimport memset, memcpy
-from libc.math cimport log, exp
+from libc.math cimport exp, log
+from libc.string cimport memcpy, memset
+
 import math
 
 from cymem.cymem cimport Pool
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 126a48945bc..76f82b84e38 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -11,7 +11,7 @@
 from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
-from ..vocab import Vocab
+from ..training import Example
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
 from .pipe import Pipe
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py
index c996074d2c4..b4961487b83 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.py
@@ -4,13 +4,6 @@
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.arc_eager import ArcEager
-
-from ._parser_internals.arc_eager cimport ArcEager
-from .transition_parser cimport Parser
-
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index d5366d7d10f..a93a6c676c2 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,9 +4,9 @@
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model
-from thinc.types import ArrayXd, Floats2d, Ints1d
+from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
 from ..errors import Errors
@@ -19,10 +19,6 @@
 from .lemmatizer import lemmatizer_score
 from .trainable_pipe import TrainablePipe
 
-# The cutoff value of *top_k* above which an alternative method is used to process guesses.
-TOP_K_GUARDRAIL = 20
-
-
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 41280c49390..1c7cf151385 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -11,20 +11,6 @@
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser import Parser
-from ._parser_internals.ner import BiluoPushDown
-from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
-from ..util import registry
-from ..training import remove_bilu_prefix
-
-from ._parser_internals.ner cimport BiluoPushDown
-from .transition_parser cimport Parser
-
-from ..language import Language
-from ..scorer import get_ner_prf
-from ..training import remove_bilu_prefix
-from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 9087bff0cdc..6dd62ed8577 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -11,6 +11,7 @@ from ..scorer import Scorer
 from .pipe import Pipe
 from .senter import senter_score
 
+
 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index e83dd789152..646166c329c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,25 +1,21 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
 import srsly
-from thinc.api import Model, Config
+from thinc.api import Config, Model
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-
 from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .tagger import ActivationsT, Tagger
-from ..language import Language
+from .. import util
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import Tagger
-
+from .tagger import ActivationsT, Tagger
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 3f876598013..cd8fea36b47 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,14 +17,6 @@
 
 import srsly
 
-from .pipe import Pipe
-from ..training import Example
-from ..language import Language
-from ..errors import Errors, Warnings
-from ..util import ensure_path, SimpleFrozenList, registry
-from ..tokens import Doc, Span
-from ..scorer import Scorer, get_ner_prf
-from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index f817ad6f94f..9d9415692a8 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,8 +1,18 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union, Protocol, runtime_checkable
-from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
-from thinc.api import Optimizer
-from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
+from dataclasses import dataclass
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    cast,
+    runtime_checkable,
+)
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 79a98b9bc5f..13841dd7bbb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,7 +1,3 @@
-from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
-from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
-from thinc.types import Floats2d
-import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 4c165e02b03..e1c1fdc7a34 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,7 +1,3 @@
-from typing import Iterable, Optional, Dict, List, Callable, Any, Union
-from thinc.types import Floats2d
-from thinc.api import Model, Config
-
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 0be04232caa..92aec22b7a7 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,6 +1,3 @@
-from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
-from thinc.api import Model, set_dropout_rate, Optimizer, Config
-from thinc.types import Floats2d
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 815f6cc4400..463b54f5fdd 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,49 +1,61 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
 from cymem.cymem cimport Pool
 
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memset
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
 
-import random
 import contextlib
+import random
+import warnings
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops, get_array_module
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
-from thinc.api import CupyOps, NumpyOps, set_dropout_rate
+from thinc.api import (
+    CupyOps,
+    NumpyOps,
+    Optimizer,
+    chain,
+    get_array_module,
+    get_ops,
+    set_dropout_rate,
+    softmax_activation,
+    use_ops,
+)
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
-from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.search cimport Beam
+
 from ..tokens.doc cimport Doc
-from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateC, StateClass
+from .trainable_pipe cimport TrainablePipe
+
 from ._parser_internals import _beam_utils
+
+from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from ..typedefs cimport weight_t
 
-from ..training import validate_examples, validate_get_examples
-from ..training import validate_distillation_examples
-from ..errors import Errors, Warnings
 from .. import util
-from ..errors import Errors
-from ..training import validate_examples, validate_get_examples
-from ._parser_internals import _beam_utils
+from ..errors import Errors, Warnings
+from ..training import (
+    validate_distillation_examples,
+    validate_examples,
+    validate_get_examples,
+)
+
 
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
diff --git a/spacy/schemas.py b/spacy/schemas.py
index cf9d3064065..6b41bb5b2b7 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,12 +1,3 @@
-from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
-from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
-from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
-from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError, Model
-from thinc.config import Promise
-from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 688dbc46261..c05731c9a15 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,6 +1,3 @@
-from libc.stdint cimport int64_t, uint32_t
-from libcpp.vector cimport vector
-from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index 64dceeb7726..768005aa2be 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -1,6 +1,5 @@
-from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
-from typing import Any, Iterable, Iterator, Optional, Union, overload
+from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union, overload
 
 class StringStore:
     def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a20e07a9482..3f01017d8f7 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,10 @@
 # cython: infer_types=True
-from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator
+from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+
 cimport cython
 from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 import srsly
@@ -14,7 +17,6 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 
 
-
 cdef class StringStore:
     """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
     """Look up strings by 64-bit hashes. Implicitly handles reserved symbols.
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 8452a5152aa..ccea3d120cf 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,7 +6,6 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index 9c73907780c..c9ed4a8bdaa 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,7 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+
 # Helper functions
 def _get_tuple(s: Span):
     return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 23fc8164412..0983159b75d 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -1,11 +1,14 @@
 # cython: infer_types=True, binding=True
+from cymem.cymem cimport Pool
+
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
 from spacy.typedefs cimport class_t, weight_t
-from cymem.cymem cimport Pool
 
-from ..conftest import cytest
 import pytest
 
+from ..conftest import cytest
+
+
 cdef struct TestState:
     int length
     int x
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index d9cbf5e8c72..f0efc3a63fd 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,8 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-from thinc.api import fix_random_seed
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 6f0e6b19841..636bb887789 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,17 +1,19 @@
 import itertools
-import pytest
+
 import numpy
+import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.training import Example
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
+from spacy.training import Example
 from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index e423965bedc..0f925c0d4e1 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,4 +1,3 @@
-from typing import cast
 import pickle
 from typing import cast
 
@@ -11,7 +10,6 @@
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
 from spacy.pipeline.trainable_pipe import TrainablePipe
-from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index a9205810945..a3ab80f7ee0 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Dict, Any, cast
+from typing import Any, Callable, Dict, Iterable, cast
 
 import pytest
 from numpy.testing import assert_equal
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 74731140688..12f2c9def2d 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,12 +2,6 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
-from spacy.tokens import Doc, Span
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.pipeline import EntityRecognizer, merge_entities
-from spacy.pipeline import SpanRuler
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index bf2eea8a94e..9a6bbc9fc60 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -9,7 +10,7 @@
 from spacy.language import Language
 from spacy.morphology import Morphology
 from spacy.pipeline import TrainablePipe
-from spacy.attrs import MORPH
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Doc
 from spacy.training import Example
 
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index a594c10b04c..9a798eae890 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_equal
 
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 5dcc2e70f67..42eb90a1bb1 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,6 +1,7 @@
 import numpy
-from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged, fix_random_seed
+import pytest
+from numpy.testing import assert_almost_equal, assert_array_equal
+from thinc.api import NumpyOps, Ragged, fix_random_seed, get_current_ops
 
 from spacy import util
 from spacy.lang.en import English
@@ -8,7 +9,7 @@
 from spacy.tokens import SpanGroup
 from spacy.tokens.span_groups import SpanGroups
 from spacy.training import Example
-from spacy.util import registry, make_tempdir
+from spacy.util import make_tempdir, registry
 
 OPS = get_current_ops()
 
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index b6f94f7f97b..05e814f0733 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,4 +1,5 @@
 from typing import cast
+
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -8,7 +9,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from thinc.api import compounding
+from spacy.training import Example
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index a54bf394608..f834597fafe 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,4 +1,3 @@
-from typing import cast
 import random
 from typing import cast
 
@@ -14,12 +13,16 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import single_label_bow_config
-from spacy.pipeline.textcat import single_label_cnn_config
-from spacy.pipeline.textcat import single_label_default_config
-from spacy.pipeline.textcat_multilabel import multi_label_bow_config
-from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
-from spacy.pipeline.textcat_multilabel import multi_label_default_config
+from spacy.pipeline.textcat import (
+    single_label_bow_config,
+    single_label_cnn_config,
+    single_label_default_config,
+)
+from spacy.pipeline.textcat_multilabel import (
+    multi_label_bow_config,
+    multi_label_cnn_config,
+    multi_label_default_config,
+)
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index eb0dcc1e38c..646ce0f5d48 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -5,14 +5,25 @@
 import spacy
 from spacy.lang.de import German
 from spacy.lang.en import English
-from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
-from spacy.language import Language
-from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
-from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
+from spacy.language import (
+    DEFAULT_CONFIG,
+    DEFAULT_CONFIG_DISTILL_PATH,
+    DEFAULT_CONFIG_PRETRAIN_PATH,
+    Language,
+)
+from spacy.ml.models import (
+    MaxoutWindowEncoder,
+    MultiHashEmbed,
+    build_tb_parser_model,
+    build_Tok2Vec_model,
+)
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
-from spacy.util import load_config, load_config_from_str
-from spacy.util import load_model_from_config, registry
+from spacy.util import (
+    load_config,
+    load_config_from_str,
+    load_model_from_config,
+    registry,
+)
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 91f6098255e..dd3d32571b1 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,9 +8,14 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import DependencyParser, EntityRecognizer
-from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
-from spacy.pipeline import TrainablePipe
+from spacy.pipeline import (
+    DependencyParser,
+    EntityRecognizer,
+    SentenceRecognizer,
+    Tagger,
+    TextCategorizer,
+    TrainablePipe,
+)
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 44f39e1d2ba..25352d2bb16 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -13,12 +13,14 @@
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.training import Example
-from spacy.lang.en import English
-from spacy.lang.de import German
-from spacy.util import registry, ignore_error, raise_error, find_matching_language
-from spacy.util import load_model_from_config
-import spacy
-from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
+from spacy.util import (
+    find_matching_language,
+    ignore_error,
+    load_model_from_config,
+    raise_error,
+    registry,
+)
+from spacy.vocab import Vocab
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 160e2ecf335..c05ef625e11 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,19 @@
 import ctypes
 import os
 from pathlib import Path
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
-from spacy.util import find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+
+import pytest
+from pydantic import ValidationError
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
index fb034accac2..2c2fcef755e 100644
--- a/spacy/tests/test_symbols.py
+++ b/spacy/tests/test_symbols.py
@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.symbols import IDS, NAMES
 
 V3_SYMBOLS = {
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index ef20ec365c6..e8a19947606 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -8,10 +8,17 @@
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
-from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
-from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
-from spacy.training.alignment_array import AlignmentArray
+from spacy.training import (
+    Alignment,
+    Corpus,
+    Example,
+    biluo_tags_to_offsets,
+    biluo_tags_to_spans,
+    docs_to_json,
+    iob_to_biluo,
+    offsets_to_biluo_tags,
+    validate_distillation_examples,
+)
 from spacy.training.align import get_alignments
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.converters import json_to_docs
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 2610532b75d..b2e50969462 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,10 +2,6 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
-from .typedefs cimport hash_t
-from .structs cimport LexemeC, SpanC, TokenC
-from .tokens.doc cimport Doc
-from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 88cd0f37dd0..8485a57c8aa 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -9,17 +9,11 @@ from preshed.maps cimport PreshMap
 
 import re
 
-from .tokens.doc cimport Doc
-from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
-from .attrs import intify_attrs
-from .symbols import ORTH, NORM
-from .errors import Errors
 from . import util
-from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 16c43485340..7617e462fde 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,7 +4,6 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .doc_bin import DocBin
-from .morphanalysis import MorphAnalysis
+from .token import Token
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index e62854f77ab..dc7c0143029 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -16,10 +16,7 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-from .span import Span
-from .token import Token
-from .span_groups import SpanGroups
-from .retokenizer import Retokenizer
+
 from ..lexeme import Lexeme
 from ..vocab import Vocab
 from .retokenizer import Retokenizer
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3c7b728f41b..ff1a0d310d1 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,15 +20,8 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-from .span_groups import SpanGroups
-from .token cimport Token
-from ..lexeme cimport Lexeme, EMPTY_LEXEME
-from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport attr_id_t
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
-from ._dict_proxies import SpanGroups
+from .span_groups import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -57,12 +50,6 @@ from ..attrs import IDS, intify_attr
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
-from .. import util
-from .. import parts_of_speech
-from .. import schemas
-from .underscore import Underscore, get_ext_args
-from .retokenizer import Retokenizer
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from .retokenizer import Retokenizer
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 8a08864d46e..4dda40a05ee 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -10,7 +10,9 @@
 from ..attrs import IDS, ORTH, SPACY, intify_attr
 from ..compat import copy_reg
 from ..errors import Errors
-from ..util import ensure_path, SimpleFrozenList
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab import Vocab
+from .doc import Doc
 from .span_groups import SpanGroups
 
 # fmt: off
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 33322d0187f..11fc535e7ca 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,9 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
-from ..morphology cimport MorphAnalysisC
 from libcpp.memory cimport shared_ptr
 
+from ..morphology cimport MorphAnalysisC
+from ..typedefs cimport hash_t
+from ..vocab cimport Vocab
+
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index 2ee7565ea85..80033dd8657 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -9,10 +9,6 @@ from libcpp.memory cimport shared_ptr
 from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
-from libcpp.memory cimport shared_ptr
-from cython.operator cimport dereference as deref
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index 68631e7547f..7b6501d4442 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -16,6 +16,9 @@ from .token cimport Token
 
 from ..attrs import intify_attrs
 from ..errors import Errors
+from ..util import SimpleFrozenDict
+from .underscore import is_writable_attr
+
 from ..strings cimport get_string_id
 
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 68f722a13cb..fb592e68bd8 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,4 +1,3 @@
-from libcpp.memory cimport shared_ptr
 cimport numpy as np
 from libcpp.memory cimport shared_ptr
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 66cae659e75..fb63b3bf959 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -40,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+
 from cython.operator cimport dereference as deref
 
 from cython.operator cimport dereference as deref
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 9445d0b63a5..adfc2bb6658 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,6 +1,3 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
-from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py
index 22d3e42dc13..2aa77ce55b1 100644
--- a/spacy/training/batchers.py
+++ b/spacy/training/batchers.py
@@ -1,4 +1,17 @@
 import itertools
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    TypeVar,
+    Union,
+)
+
 from thinc.schedules import Schedule
 
 from thinc.schedules import Schedule
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 21c3d56a118..c2f3b8b51fa 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
 
 from ..errors import Errors
+from ..language import Language
 from ..util import load_model, logger, registry
 
-if TYPE_CHECKING:
-    from ..language import Language
-
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(
diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py
index 1ff7a64e09d..a78c39aea7b 100644
--- a/spacy/training/converters/json_to_docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,9 +1,13 @@
 import srsly
-from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations_to_doc
-from ..example import _fix_legacy_dict_data, _parse_example_dict_data
-from ...util import load_model
+
 from ...lang.mul import MultiLanguage
+from ...util import load_model
+from ..example import (
+    _fix_legacy_dict_data,
+    _parse_example_dict_data,
+    annotations_to_doc,
+)
+from ..gold_io import json_iterate, json_to_annotations
 
 
 def json_to_docs(input_data, model=None, **kwargs):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3b3df2115e8..d121c9aa56f 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,5 @@
 from collections.abc import Iterable as IterableInstance
+
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/spacy/ty.py b/spacy/ty.py
index ac09cb336ac..e4f34a5f651 100644
--- a/spacy/ty.py
+++ b/spacy/ty.py
@@ -1,5 +1,17 @@
-from typing import TYPE_CHECKING, Protocol, runtime_checkable
-from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    runtime_checkable,
+)
+
+from thinc.api import Model, Optimizer
 
 if TYPE_CHECKING:
     from .language import Language
diff --git a/spacy/util.py b/spacy/util.py
index 624fffe865d..ae9837e3afe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,13 +2,7 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model, constant as constant_schedule
-from thinc.api import fix_random_seed, set_gpu_allocator
-import functools
+import inspect
 import itertools
 import logging
 import os
@@ -65,9 +59,6 @@
     cupy = None
 
 
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
-from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From d608f30295c7bd8f0a8db933a2b876e19d41851f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:21 +0200
Subject: [PATCH 390/504] Fix span <-> underscore import cycle

---
 spacy/tokens/underscore.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 63706851286..c3e3641d454 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -3,10 +3,10 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
-from .span import Span
 
 if TYPE_CHECKING:
     from .doc import Doc
+    from .span import Span
     from .token import Token
 
 
@@ -40,7 +40,10 @@ def __init__(
         object.__setattr__(self, "_doc", obj.doc)
         object.__setattr__(self, "_start", start)
         object.__setattr__(self, "_end", end)
-        if type(obj) == Span:
+        # We used to check if obj is a span, however, this introduces an
+        # import cycle between the span and underscore modeles. So we
+        # do a structural type check instead.
+        if hasattr(obj, "id") and hasattr(obj, "label") and hasattr(obj, "kb_id"):
             object.__setattr__(self, "_label", label)
             object.__setattr__(self, "_kb_id", kb_id)
             object.__setattr__(self, "_span_id", span_id)

From feb9ae9e188f93351e1486ba6c9e809fa4ce0204 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 12:43:45 +0200
Subject: [PATCH 391/504] Fix training.callbacks <-> language import cycle

---
 spacy/training/callbacks.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index c2f3b8b51fa..21c3d56a118 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,9 +1,11 @@
-from typing import Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional
 
 from ..errors import Errors
-from ..language import Language
 from ..util import load_model, logger, registry
 
+if TYPE_CHECKING:
+    from ..language import Language
+
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(

From aa924a5effb1d2c2b901e3fba44c97b36937e164 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 6 Jul 2023 15:20:13 +0200
Subject: [PATCH 392/504] Disallow False for first/last arguments of add_pipe
 (#12793)

* Literal True for first/last options

* add test case

* update docs

* remove old redundant test case

* black formatting

* use Optional typing in docstrings

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/errors.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index 15ef239ea5b..c4a69935ce7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -992,6 +992,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

From 12e6a3f6b2467ecb5eea99e728db2899da5091a6 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 16:38:29 +0200
Subject: [PATCH 393/504] merge fixes

---
 spacy/pipeline/transition_parser.pyx          | 27 +++++++------------
 .../tests/serialize/test_serialize_config.py  |  1 +
 spacy/vectors.pyx                             |  2 +-
 3 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 463b54f5fdd..878c8425a01 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -7,15 +7,9 @@ from typing import Dict, Iterable, List, Optional, Tuple
 cimport numpy as np
 from cymem.cymem cimport Pool
 
-from itertools import islice
-
-from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
 import contextlib
 import random
-import warnings
+from itertools import islice
 
 import numpy
 import numpy.random
@@ -24,29 +18,21 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
-    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
-    softmax_activation,
-    use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
 from ..ml.tb_framework import TransitionModelInputs
 
 from ..tokens.doc cimport Doc
-from ._parser_internals cimport _beam_utils
-from ._parser_internals.search cimport Beam
-from ._parser_internals.stateclass cimport StateC, StateClass
-from .trainable_pipe cimport TrainablePipe
-
-from ._parser_internals import _beam_utils
-
 from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
+from ._parser_internals cimport _beam_utils
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 from .. import util
 from ..errors import Errors, Warnings
@@ -78,6 +64,11 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
+
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 646ce0f5d48..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -18,6 +18,7 @@
     build_Tok2Vec_model,
 )
 from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
+from spacy.training import Example
 from spacy.util import (
     load_config,
     load_config_from_str,
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 876c56bed1d..111a9d01e08 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -130,7 +130,7 @@ cdef class Vectors(BaseVectors):
     cdef readonly unicode eow
     cdef readonly attr_id_t attr
 
-    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+    def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
         """Create a new vector store.
 
         strings (StringStore): The string store.

From 31880fbc6778ccb3efe9f3c665d0ab15c94ccf34 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 17:41:29 +0200
Subject: [PATCH 394/504] cython fixes and cleanup

---
 spacy/matcher/phrasematcher.pyx               |  2 -
 spacy/ml/tb_framework.pyx                     | 55 ++++++++++---------
 spacy/morphology.pyx                          |  6 +-
 spacy/parts_of_speech.pxd                     |  2 +-
 spacy/pipeline/_parser_internals/search.pyx   | 12 ++--
 .../_parser_internals/transition_system.pxd   |  4 +-
 .../_parser_internals/transition_system.pyx   | 21 ++++---
 spacy/strings.pyx                             |  9 +--
 spacy/tests/parser/_search.pyx                | 49 +++++++++--------
 spacy/tokens/doc.pyx                          |  2 +-
 spacy/tokens/span.pyx                         |  2 -
 11 files changed, 77 insertions(+), 87 deletions(-)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 5c79cda6c1e..e8ad394b0d7 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -161,7 +161,6 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
@@ -203,7 +202,6 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index fd0af12ceab..ed04045a6a5 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (
 
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
 
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
     Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
     if X is not None:
-        docs, moves = X
+        docs, _ = X
         model.get_ref("tok2vec").initialize(X=docs)
     else:
         model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")
@@ -192,9 +192,10 @@ class TransitionModelInputs:
         self,
         docs: List[Doc],
         moves: TransitionSystem,
-        actions: Optional[List[Ints1d]]=None,
-        max_moves: int=0,
-        states: Optional[List[State]]=None):
+        actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0,
+        states: Optional[List[State]] = None,
+    ):
         """
         actions (Optional[List[Ints1d]]): actions to apply for each Doc.
         docs (List[Doc]): Docs to predict transition sequences for.
@@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
     else:
         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-            feats, backprop_feats, seen_mask, is_train, actions=actions,
-            max_moves=inputs.max_moves)
+                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
+                                 max_moves=inputs.max_moves)
 
 
 def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
     cdef vector[StateC*] c_states
     cdef StateClass state
     for state in states:
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
 
     return (states, scores), backprop
 
+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
     cdef vector[StateC *] unfinished
     cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
@@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                    sizes.states)
+                                   sizes.states)
             else:
                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
             for i in range(sizes.states):
@@ -302,8 +304,8 @@ def _forward_fallback(
     backprop_feats,
     seen_mask,
     is_train: bool,
-    actions: Optional[List[Ints1d]]=None,
-    max_moves: int=0):
+    actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")
@@ -371,7 +373,7 @@ def _forward_fallback(
             for clas in set(model.attrs["unseen_classes"]):
                 if (d_scores[:, clas] < 0).any():
                     model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
         # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
         output.inc_grad("b", d_scores.sum(axis=0))
@@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
@@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
     else:
         # Compute hidden-to-output
         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                      1.0, <const float *>A.hiddens, n.hiddens,
-                      <const float *>W.hidden_weights, n.hiddens,
-                      0.0, scores, n.classes)
+                     1.0, <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens,
+                     0.0, scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                 scores[i*n.classes+j] = min_
 
 
-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
     cdef int F = n.feats
     cdef int T = n.tokens
     padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                 feature = &cached[idx]
             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
         token_ids += F
-
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index a6abf8abec7..5d5fa0369f8 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -80,15 +80,13 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
-
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-        ])
+            ])
         return norm_feats_string or self.EMPTY_MORPH
 
-
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 01f116ea688..22a571be7b0 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
     ADV = symbols.ADV
     AUX = symbols.AUX
     CONJ = symbols.CONJ
-    CCONJ  = symbols.CCONJ  # U20
+    CCONJ = symbols.CCONJ  # U20
     DET = symbols.DET
     INTJ = symbols.INTJ
     NOUN = symbols.NOUN
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 251eaa805cb..578299b56ae 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,11 +1,8 @@
 # cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
-from libc.math cimport exp, log
-from libc.string cimport memcpy, memset
-
-import math
-
 from cymem.cymem cimport Pool
+from libc.math cimport exp
+from libc.string cimport memcpy, memset
 from preshed.maps cimport PreshMap
 
 
@@ -70,7 +67,7 @@ cdef class Beam:
             self.costs[i][j] = costs[j]
 
     cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
-        cdef int i, j
+        cdef int i
         for i in range(self.width):
             memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
             memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
@@ -176,7 +173,6 @@ cdef class Beam:
         beam-width, and n is the number of classes.
         """
         cdef Entry entry
-        cdef weight_t score
         cdef _State* s
         cdef int i, j, move_id
         assert self.size >= 1
@@ -269,7 +265,7 @@ cdef class MaxViolation:
                 # This can happen from non-monotonic actions
                 # If we find a better gold analysis this way, be sure to keep it.
                 elif pred._states[i].loss <= 0 \
-                and tuple(pred.histories[i]) not in seen_golds:
+                        and tuple(pred.histories[i]) not in seen_golds:
                     g_scores.append(pred._states[i].score)
                     g_hist.append(list(pred.histories[i]))
             for i in range(gold.size):
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 66cc7747b69..08baed932ba 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -60,7 +60,7 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil
+                          int batch_size) nogil
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-        int nr_class, int batch_size) nogil
+                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 7bd39ba43c5..ae1cf890f3e 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -291,19 +291,19 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil:
-        cdef int i
-        cdef Transition action
-        cdef StateC* state
-        for i in range(batch_size):
-            state = states[i]
-            action = moves.c[actions[i]]
-            action.do(state, action.label)
-            state.history.push_back(action.clas)
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
 
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-    int nr_class, int batch_size) nogil:
+                             int nr_class, int batch_size) nogil:
     is_valid = <int*>calloc(moves.n_moves, sizeof(int))
     cdef int i, guess
     cdef Transition action
@@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
             action.do(states[i], action.label)
             states[i].history.push_back(guess)
     free(is_valid)
-
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 3f01017d8f7..d0ec2f730db 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,7 +1,6 @@
 # cython: infer_types=True
-from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Iterable, Iterator, List, Optional, Tuple, Union
 
-cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from libcpp.set cimport set
@@ -272,7 +271,6 @@ cdef class StringStore:
         cdef int n_length_bytes
         cdef int i
         cdef Utf8Str* string = <Utf8Str*>self.mem.alloc(1, sizeof(Utf8Str))
-        cdef uint32_t ulength = length
         if length < sizeof(string.s):
             string.s[0] = <unsigned char>length
             memcpy(&string.s[1], chars, length)
@@ -330,7 +328,7 @@ cpdef hash_t get_string_id(object string_or_hash) except -1:
 
     try:
         return hash_string(string_or_hash)
-    except:
+    except:   # no-cython-lint
         if _try_coerce_to_hash(string_or_hash, &str_hash):
             # Coerce the integral key to the expected primitive hash type.
             # This ensures that custom/overloaded "primitive" data types
@@ -347,6 +345,5 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
         return True
-    except:
+    except:  # no-cython-lint
         return False
-
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index 0983159b75d..cd9e6b2f5ee 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -2,7 +2,7 @@
 from cymem.cymem cimport Pool
 
 from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
-from spacy.typedefs cimport class_t, weight_t
+from spacy.typedefs cimport class_t
 
 import pytest
 
@@ -42,32 +42,35 @@ cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
     state = <TestState*>state
     mem.free(state)
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width",
-    [
-        (2, 3),
-        (3, 6),
-        (4, 20),
-    ]
-)
+                         [
+                             (2, 3),
+                             (3, 6),
+                             (4, 20),
+                         ]
+                         )
 def test_init(nr_class, beam_width):
     b = Beam(nr_class, beam_width)
     assert b.size == 1
     assert b.width == beam_width
     assert b.nr_class == nr_class
 
+
 @cytest
 def test_init_violn():
     MaxViolation()
 
+
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (2, 3, 3),
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (2, 3, 3),
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_initialize(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
@@ -79,11 +82,11 @@ def test_initialize(nr_class, beam_width, length):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length,extra",
-    [
-        (2, 3, 4, None),
-        (3, 6, 15, u"test beam 1"),
-    ]
-)
+                         [
+                             (2, 3, 4, None),
+                             (3, 6, 15, u"test beam 1"),
+                         ]
+                         )
 def test_initialize_extra(nr_class, beam_width, length, extra):
     b = Beam(nr_class, beam_width)
     if extra is None:
@@ -97,11 +100,11 @@ def test_initialize_extra(nr_class, beam_width, length, extra):
 
 @cytest
 @pytest.mark.parametrize("nr_class,beam_width,length",
-    [
-        (3, 6, 15),
-        (4, 20, 32),
-    ]
-)
+                         [
+                             (3, 6, 15),
+                             (4, 20, 32),
+                         ]
+                         )
 def test_transition(nr_class, beam_width, length):
     b = Beam(nr_class, beam_width)
     b.initialize(initialize, destroy, length, NULL)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ff1a0d310d1..4b8a15a65fd 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1758,7 +1758,7 @@ cdef class Doc:
                                     data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
                                     data["underscore_span"][attr] = []
-                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id})
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id": _span_id})
 
             for attr in underscore:
                 if attr not in user_keys:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 5d52e4fcfc9..5b4f929028a 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -270,7 +270,6 @@ cdef class Span:
 
     @property
     def _(self):
-        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
@@ -1018,7 +1017,6 @@ cdef class Span:
             self.id_ = ent_id_
 
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From ecac8dade7fcfc94664aacf50623cfa4835afcac Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 20 Jul 2023 09:59:19 +0200
Subject: [PATCH 395/504] Update spacy/ml/tb_framework.pyx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/ml/tb_framework.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index ed04045a6a5..a48c6b901c7 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -305,7 +305,8 @@ def _forward_fallback(
     seen_mask,
     is_train: bool,
     actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0):
+    max_moves: int = 0,
+):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")

From 2042578acb7a24a90c088037f2f665ced2b373a8 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 20 Jul 2023 14:08:29 +0200
Subject: [PATCH 396/504] remove unnecessary line

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/ml/tb_framework.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index a48c6b901c7..6c5c29d8549 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -145,7 +145,6 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")

From 94707a87238f06cd02b4ce305a277831e62401e6 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 15:54:35 +0200
Subject: [PATCH 397/504] Recommend lookups tables from URLs or other loaders
 (#12283)

* Recommend lookups tables from URLs or other loaders

Shift away from the `lookups` extra (which isn't removed, just no longer
mentioned) and recommend loading data from the `spacy-lookups-data` repo
or other sources rather than the `spacy-lookups-data` package.

If the tables can't be loaded from the `lookups` registry in the
lemmatizer, show how to specify the tables in `[initialize]` rather than
recommending the `spacy-lookups-data` package.

* Add tests for some rule-based lemmatizers

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/errors.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index c4a69935ce7..212f4b98c46 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,6 +1,8 @@
 import warnings
 from typing import Literal
 
+from . import about
+
 
 class ErrorsWithCodes(type):
     def __getattribute__(self, code):
@@ -993,6 +995,18 @@ class Errors(metaclass=ErrorsWithCodes):
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
     E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
+    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
+             "[initialize] or in registered lookups (spacy-lookups-data). An "
+             "example for how to load lemmatizer tables in [initialize]:\n\n"
+             "[initialize.components]\n\n"
+             "[initialize.components.{pipe_name}]\n\n"
+             "[initialize.components.{pipe_name}.lookups]\n"
+             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
+             "lang = ${{nlp.lang}}\n"
+             f'url = "{about.__lookups_url__}"\n'
+             "tables = {tables}\n"
+             "# or required tables only: tables = {required_tables}\n")
+    E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

From 1d788f0b61c5bd3ecd8827981441fb4fa4fe6f2c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 1 Aug 2023 22:24:02 +0900
Subject: [PATCH 398/504] Accept multiple code files in all CLI commands
 (#12101)

* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 website/docs/api/cli.mdx | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index e5bee209281..9fddbbd01db 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1341,19 +1341,19 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                    | Description                                                                                                                                                                          |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`             | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`             | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`         | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`            | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`      | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`        | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`  | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| Name                     | Description                                                                                                                                                                                            |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                                  |
+| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                                           |
+| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                                          |
+| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                                              |
+| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                                     |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                                       |
+| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
 
 ## assemble {#assemble tag="command"}
 ## assemble {#assemble tag="command"}

From 505fdc752c29c046734c1abb3fb15e9265ebcc27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 14:38:05 +0100
Subject: [PATCH 399/504] Revert "Reimplement distillation with oracle cut size
 (#12214)"

This reverts commit e27c60a70263f7ab17968964de37e938653e37a2.
---
 spacy/ml/tb_framework.pyx            |  4 +-
 spacy/pipeline/transition_parser.pyx |  5 +--
 spacy/tests/parser/test_model.py     | 61 ----------------------------
 spacy/tests/parser/test_ner.py       |  5 +--
 spacy/tests/parser/test_parse.py     |  5 +--
 5 files changed, 4 insertions(+), 76 deletions(-)
 delete mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 6c5c29d8549..e497643f0cd 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -267,11 +267,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1 and (actions is None or len(actions) > 0):
+    while sizes.states >= 1:
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
-        assert step_actions is None or step_actions.size == sizes.states, \
-            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 878c8425a01..41c95c94747 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -65,10 +65,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 
 
 
-# TODO: Remove when we switch to Cython 3.
-cdef extern from "<algorithm>" namespace "std" nogil:
-    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
-
 NUMPY_OPS = NumpyOps()
 
 
@@ -704,6 +700,7 @@ class Parser(TrainablePipe):
                 length += 1
         return states
 
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long doc will get multiple states. Let's say we
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
deleted file mode 100644
index 8c1cf7a9346..00000000000
--- a/spacy/tests/parser/test_model.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import pytest
-
-from spacy.lang.en import English
-from spacy.ml.tb_framework import TransitionModelInputs
-from spacy.training import Example
-
-TRAIN_DATA = [
-    (
-        "They trade mortgage-backed securities.",
-        {
-            "heads": [1, 1, 4, 4, 5, 1, 1],
-            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
-        },
-    ),
-    (
-        "I like London and Berlin.",
-        {
-            "heads": [1, 1, 1, 2, 2, 1],
-            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
-        },
-    ),
-]
-
-
-@pytest.fixture
-def nlp_parser():
-    nlp = English()
-    parser = nlp.add_pipe("parser")
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for dep in annotations["deps"]:
-            parser.add_label(dep)
-    nlp.initialize()
-
-    return nlp, parser
-
-
-def test_incorrect_number_of_actions(nlp_parser):
-    nlp, parser = nlp_parser
-    doc = nlp.make_doc("test")
-
-    # Too many actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
-            )
-        )
-
-    # Too few actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc, doc],
-                moves=parser.moves,
-                actions=[numpy.array([0], dtype="i")],
-            )
-        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index f0efc3a63fd..bb9b7653ce3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -623,9 +623,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,7 +641,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 636bb887789..d25eb165acb 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -462,9 +462,7 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -482,7 +480,6 @@ def test_distill(max_moves):
 
     student = English()
     student_parser = student.add_pipe("parser")
-    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 08c0a97d2c0b57a616f6cf940fea1c6cf159d9d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:23:08 +0100
Subject: [PATCH 400/504] Revert "Merge the parser refactor into `v4` (#10940)"

This reverts commit a183db3cefe0713e828868b43daf61c464cae05c.
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/compat.py                               |   5 +
 spacy/errors.py                               |   7 +-
 spacy/ml/_precomputable_affine.py             | 164 +++++
 spacy/ml/models/parser.py                     | 177 +++--
 spacy/ml/parser_model.pxd                     |  49 ++
 spacy/ml/parser_model.pyx                     | 500 ++++++++++++++
 spacy/ml/tb_framework.pxd                     |  28 -
 spacy/ml/tb_framework.py                      |  50 ++
 spacy/ml/tb_framework.pyx                     | 639 ------------------
 .../_parser_internals/_parser_utils.pxd       |   2 -
 .../_parser_internals/_parser_utils.pyx       |  22 -
 spacy/pipeline/_parser_internals/_state.pxd   |  60 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 -
 spacy/pipeline/_parser_internals/batch.pxd    |   2 -
 spacy/pipeline/_parser_internals/batch.pyx    |  52 --
 spacy/pipeline/_parser_internals/ner.pyx      |   6 +-
 .../pipeline/_parser_internals/stateclass.pyx |   7 -
 .../_parser_internals/transition_system.pxd   |   7 -
 .../_parser_internals/transition_system.pyx   |  69 --
 .../{dep_parser.py => dep_parser.pyx}         |  20 +-
 spacy/pipeline/{ner.py => ner.pyx}            |  35 +-
 spacy/pipeline/transition_parser.pxd          |  21 +
 spacy/tests/parser/test_ner.py                |  10 +-
 spacy/tests/parser/test_parse.py              |  74 +-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  55 +-
 spacy/training/example.pyx                    |   2 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/cli.mdx                      |   8 +-
 website/docs/api/legacy.mdx                   |   2 +-
 .../docs/usage/embeddings-transformers.mdx    |   6 +-
 34 files changed, 1092 insertions(+), 1092 deletions(-)
 create mode 100644 spacy/ml/_precomputable_affine.py
 create mode 100644 spacy/ml/parser_model.pxd
 create mode 100644 spacy/ml/parser_model.pyx
 delete mode 100644 spacy/ml/tb_framework.pxd
 create mode 100644 spacy/ml/tb_framework.py
 delete mode 100644 spacy/ml/tb_framework.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.py => dep_parser.pyx} (96%)
 rename spacy/pipeline/{ner.py => ner.pyx} (93%)
 create mode 100644 spacy/pipeline/transition_parser.pxd

diff --git a/setup.py b/setup.py
index a4a87d68aec..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,12 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.tb_framework",
+    "spacy.ml.parser_model",
     "spacy.morphology",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -43,7 +45,6 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
-    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -51,7 +52,6 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
-    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 36325711d4d..2817147f3e9 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,11 +90,12 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -110,11 +111,12 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -385,11 +387,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -402,11 +405,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/compat.py b/spacy/compat.py
index 1e63807a0e8..30459e2e495 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,6 +23,11 @@
 except ImportError:
     cupy = None
 
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index 212f4b98c46..bf8804e4f36 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,12 +217,6 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
-    # v4 warning strings
-    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
-    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
-            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
-            "to return `True` in `.supports_prior_probs`.")
-
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -1009,6 +1003,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
+
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
new file mode 100644
index 00000000000..1c20c622b2c
--- /dev/null
+++ b/spacy/ml/_precomputable_affine.py
@@ -0,0 +1,164 @@
+from thinc.api import Model, normal_init
+
+from ..util import registry
+
+
+@registry.layers("spacy.PrecomputableAffine.v1")
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
+    model = Model(
+        "precomputable_affine",
+        forward,
+        init=init,
+        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+        params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout},
+    )
+    return model
+
+
+def forward(model, X, is_train):
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.get_param("W")
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
+    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
+
+    def backward(dY_ids):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nO, nP), and get back:
+        # (nB, nO, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nO, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        assert dY.ndim == 3
+        assert dY.shape[1] == nO, dY.shape
+        assert dY.shape[2] == nP, dY.shape
+        # nB = dY.shape[0]
+        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+        Xf = X[ids]
+        Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+        model.inc_grad("b", dY.sum(axis=0))
+        dY = dY.reshape((dY.shape[0], nO * nP))
+
+        Wopfi = W.transpose((1, 2, 0, 3))
+        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
+        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+        # (o, p, f, i) --> (f, o, p, i)
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
+        model.inc_grad("W", dWopfi)
+        return dXf.reshape((dXf.shape[0], nF, nI))
+
+    return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+    nB = dY.shape[0]
+    nF = model.get_dim("nF")
+    nP = model.get_dim("nP")
+    nO = model.get_dim("nO")
+    # Backprop the "padding", used as a filler for missing values.
+    # Values that are missing are set to -1, and each state vector could
+    # have multiple missing values. The padding has different values for
+    # different missing features. The gradient of the padding vector is:
+    #
+    # for b in range(nB):
+    #     for f in range(nF):
+    #         if ids[b, f] < 0:
+    #             d_pad[f] += dY[b]
+    #
+    # Which can be rewritten as:
+    #
+    # (ids < 0).T @ dY
+    mask = model.ops.asarray(ids < 0, dtype="f")
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
+    return d_pad.reshape((1, nF, nO, nP))
+
+
+def init(model, X=None, Y=None):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    if model.has_param("W") and model.get_param("W").any():
+        return
+
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nO, nP, nI)
+    b = model.ops.alloc2f(nO, nP)
+    pad = model.ops.alloc4f(1, nF, nO, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
+        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors = vectors.reshape((vectors.shape[0], nO, nP))
+        vectors += b
+        vectors = model.ops.asarray(vectors)
+        if nP >= 2:
+            return model.ops.maxout(vectors)[0]
+        else:
+            return vectors * (vectors >= 0)
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = model.get_param("W").copy()
+    b = model.get_param("b").copy()
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("b", b)
+        else:
+            break
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 422abf4e260..a70d84dea8f 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,66 +1,23 @@
-import warnings
-from typing import Any, List, Literal, Optional, Tuple
-
-from thinc.api import Model
+from typing import Optional, List, cast
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
-from ...errors import Errors, Warnings
-from ...tokens.doc import Doc
+from ...errors import Errors
+from ...compat import Literal
 from ...util import registry
+from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-
-TransitionSystem = Any  # TODO
-State = Any  # TODO
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v2")
-def transition_parser_v2(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    use_upper: bool,
-    nO: Optional[int] = None,
-) -> Model:
-    if not use_upper:
-        warnings.warn(Warnings.W400)
-
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
-
-
-@registry.architectures.register("spacy.TransitionBasedParser.v3")
-def transition_parser_v3(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: Literal["parser", "ner"],
-    extra_state_tokens: bool,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-) -> Model:
-    return build_tb_parser_model(
-        tok2vec,
-        state_type,
-        extra_state_tokens,
-        hidden_width,
-        maxout_pieces,
-        nO=nO,
-    )
+from ...tokens import Doc
 
 
+@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
+    use_upper: bool,
     nO: Optional[int] = None,
 ) -> Model:
     """
@@ -94,7 +51,14 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3.
+        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
+        is replaced with a ReLu non-linearity if use_upper=True, and no
+        non-linearity if use_upper=False.
+    use_upper (bool): Whether to use an additional hidden layer after the state
+        vector in order to predict the action scores. It is recommended to set
+        this to False for large pretrained models such as transformers, and True
+        for smaller networks. The upper layer is computed on CPU, which becomes
+        a bottleneck on larger GPU-based models, where it's also less necessary.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -105,11 +69,106 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    return TransitionModel(
-        tok2vec=tok2vec,
-        state_tokens=nr_feature_tokens,
-        hidden_width=hidden_width,
-        maxout_pieces=maxout_pieces,
-        nO=nO,
-        unseen_classes=set(),
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec = chain(
+        tok2vec,
+        list2array(),
+        Linear(hidden_width, t2v_width),
     )
+    tok2vec.set_dim("nO", hidden_width)
+    lower = _define_lower(
+        nO=hidden_width if use_upper else nO,
+        nF=nr_feature_tokens,
+        nI=tok2vec.get_dim("nO"),
+        nP=maxout_pieces,
+    )
+    upper = None
+    if use_upper:
+        with use_ops("cpu"):
+            # Initialize weights at zero, as it's a classification layer.
+            upper = _define_upper(nO=nO, nI=None)
+    return TransitionModel(tok2vec, lower, upper, resize_output)
+
+
+def _define_upper(nO, nI):
+    return Linear(nO=nO, nI=nI, init_W=zero_init)
+
+
+def _define_lower(nO, nF, nI, nP):
+    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
+
+
+def resize_output(model, new_nO):
+    if model.attrs["has_upper"]:
+        return _resize_upper(model, new_nO)
+    return _resize_lower(model, new_nO)
+
+
+def _resize_upper(model, new_nO):
+    upper = model.get_ref("upper")
+    if upper.has_dim("nO") is None:
+        upper.set_dim("nO", new_nO)
+        return model
+    elif new_nO == upper.get_dim("nO"):
+        return model
+
+    smaller = upper
+    nI = smaller.maybe_get_dim("nI")
+    with use_ops("cpu"):
+        larger = _define_upper(nO=new_nO, nI=nI)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc2f(new_nO, nI)
+        larger_b = larger.ops.alloc1f(new_nO)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        # Weights are stored in (nr_out, nr_in) format, so we're basically
+        # just adding rows here.
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:old_nO] = smaller_W
+            larger_b[:old_nO] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+    model._layers[-1] = larger
+    model.set_ref("upper", larger)
+    return model
+
+
+def _resize_lower(model, new_nO):
+    lower = model.get_ref("lower")
+    if lower.has_dim("nO") is None:
+        lower.set_dim("nO", new_nO)
+        return model
+
+    smaller = lower
+    nI = smaller.maybe_get_dim("nI")
+    nF = smaller.maybe_get_dim("nF")
+    nP = smaller.maybe_get_dim("nP")
+    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
+    # it could be that the model is not initialized yet, then skip this bit
+    if smaller.has_param("W"):
+        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
+        larger_b = larger.ops.alloc2f(new_nO, nP)
+        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
+        smaller_W = smaller.get_param("W")
+        smaller_b = smaller.get_param("b")
+        smaller_pad = smaller.get_param("pad")
+        # Copy the old weights and padding into the new layer
+        if smaller.has_dim("nO"):
+            old_nO = smaller.get_dim("nO")
+            larger_W[:, 0:old_nO, :, :] = smaller_W
+            larger_pad[:, :, 0:old_nO, :] = smaller_pad
+            larger_b[0:old_nO, :] = smaller_b
+            for i in range(old_nO, new_nO):
+                model.attrs["unseen_classes"].add(i)
+
+        larger.set_param("W", larger_W)
+        larger.set_param("b", larger_b)
+        larger.set_param("pad", larger_pad)
+    model._layers[1] = larger
+    model.set_ref("lower", larger)
+    return model
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
new file mode 100644
index 00000000000..8def6cea53f
--- /dev/null
+++ b/spacy/ml/parser_model.pxd
@@ -0,0 +1,49 @@
+from libc.string cimport memset, memcpy
+from thinc.backends.cblas cimport CBlas
+from ..typedefs cimport weight_t, hash_t
+from ..pipeline._parser_internals._state cimport StateC
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const float* seen_classes
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* scores
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
+
+
+cdef WeightsC get_c_weights(model) except *
+
+cdef SizesC get_c_sizes(model, int batch_size) except *
+
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil
+ 
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores, int O) nogil
+ 
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
new file mode 100644
index 00000000000..91558683b60
--- /dev/null
+++ b/spacy/ml/parser_model.pyx
@@ -0,0 +1,500 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+cimport numpy as np
+from libc.math cimport exp
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from thinc.backends.cblas cimport saxpy, sgemm
+
+import numpy
+import numpy.random
+from thinc.api import Model, CupyOps, NumpyOps, get_ops
+
+from .. import util
+from ..errors import Errors
+from ..typedefs cimport weight_t, class_t, hash_t
+from ..pipeline._parser_internals.stateclass cimport StateClass
+
+
+cdef WeightsC get_c_weights(model) except *:
+    cdef WeightsC output
+    cdef precompute_hiddens state2vec = model.state2vec
+    output.feat_weights = state2vec.get_feat_weights()
+    output.feat_bias = <const float*>state2vec.bias.data
+    cdef np.ndarray vec2scores_W
+    cdef np.ndarray vec2scores_b
+    if model.vec2scores is None:
+        output.hidden_weights = NULL
+        output.hidden_bias = NULL
+    else:
+        vec2scores_W = model.vec2scores.get_param("W")
+        vec2scores_b = model.vec2scores.get_param("b")
+        output.hidden_weights = <const float*>vec2scores_W.data
+        output.hidden_bias = <const float*>vec2scores_b.data
+    cdef np.ndarray class_mask = model._class_mask
+    output.seen_classes = <const float*>class_mask.data
+    return output
+
+
+cdef SizesC get_c_sizes(model, int batch_size) except *:
+    cdef SizesC output
+    output.states = batch_size
+    if model.vec2scores is None:
+        output.classes = model.state2vec.get_dim("nO")
+    else:
+        output.classes = model.vec2scores.get_dim("nO")
+    output.hiddens = model.state2vec.get_dim("nO")
+    output.pieces = model.state2vec.get_dim("nP")
+    output.feats = model.state2vec.get_dim("nF")
+    output.embed_width = model.tokvecs.shape[1]
+    return output
+
+
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    resize_activations(&A, n)
+    return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.scores)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.scores = <float*>realloc(A.scores,
+            n.states * n.classes * sizeof(A.scores[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil:
+    cdef double one = 1.0
+    resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
+    sum_state_features(cblas, A.unmaxed,
+        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = _arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    memset(A.scores, 0, n.states * n.classes * sizeof(float))
+    if W.hidden_weights == NULL:
+        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+            1.0, <const float *>A.hiddens, n.hiddens,
+            <const float *>W.hidden_weights, n.hiddens,
+            0.0, A.scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = A.scores[0]
+    for i in range(1, n.states * n.classes):
+        if A.scores[i] < min_:
+            min_ = A.scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if not W.seen_classes[j]:
+                A.scores[i*n.classes+j] = min_
+
+
+cdef void sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    padding = cached
+    cached += F * O
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores,
+        int O) nogil:
+    """Do multi-label log loss"""
+    cdef double max_, gmax, Z, gZ
+    best = arg_max_if_gold(scores, costs, is_valid, O)
+    guess = _arg_max(scores, O)
+
+    if best == -1 or guess == -1:
+        # These shouldn't happen, but if they do, we want to make sure we don't
+        # cause an OOB access.
+        return
+    Z = 1e-10
+    gZ = 1e-10
+    max_ = scores[guess]
+    gmax = scores[best]
+    for i in range(O):
+        Z += exp(scores[i] - max_)
+        if costs[i] <= costs[best]:
+            gZ += exp(scores[i] - gmax)
+    for i in range(O):
+        if costs[i] <= costs[best]:
+            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
+        else:
+            d_scores[i] = exp(scores[i]-max_) / Z
+
+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
+        const int* is_valid, int n) nogil:
+    # Find minimum cost
+    cdef float cost = 1
+    for i in range(n):
+        if is_valid[i] and costs[i] < cost:
+            cost = costs[i]
+    # Now find best-scoring with that cost
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= cost and is_valid[i]:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+
+class ParserStepModel(Model):
+    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
+            dropout=0.1):
+        Model.__init__(self, name="parser_step_model", forward=step_forward)
+        self.attrs["has_upper"] = has_upper
+        self.attrs["dropout_rate"] = dropout
+        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
+        if layers[1].get_dim("nP") >= 2:
+            activation = "maxout"
+        elif has_upper:
+            activation = None
+        else:
+            activation = "relu"
+        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+                                            activation=activation, train=train)
+        if has_upper:
+            self.vec2scores = layers[-1]
+        else:
+            self.vec2scores = None
+        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
+        self.backprops = []
+        self._class_mask = numpy.zeros((self.nO,), dtype='f')
+        self._class_mask.fill(1)
+        if unseen_classes is not None:
+            for class_ in unseen_classes:
+                self._class_mask[class_] = 0.
+
+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
+    @property
+    def nO(self):
+        if self.attrs["has_upper"]:
+            return self.vec2scores.get_dim("nO")
+        else:
+            return self.state2vec.get_dim("nO")
+
+    def class_is_unseen(self, class_):
+        return self._class_mask[class_]
+
+    def mark_class_unseen(self, class_):
+        self._class_mask[class_] = 0
+
+    def mark_class_seen(self, class_):
+        self._class_mask[class_] = 1
+
+    def get_token_ids(self, states):
+        cdef StateClass state
+        states = [state for state in states if not state.is_final()]
+        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
+                                          dtype='i', order='C')
+        ids.fill(-1)
+        c_ids = <int*>ids.data
+        for state in states:
+            state.c.set_context_tokens(c_ids, ids.shape[1])
+            c_ids += ids.shape[1]
+        return ids
+
+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
+    def finish_steps(self, golds):
+        # Add a padding vector to the d_tokvecs gradient, so that missing
+        # values don't affect the real gradient.
+        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
+        for ids, d_vector, bp_vector in self.backprops:
+            d_state_features = bp_vector((d_vector, ids))
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.ops.scatter_add(d_tokvecs, ids,
+                d_state_features)
+        # Padded -- see update()
+        self.bp_tokvecs(d_tokvecs[:-1])
+        return d_tokvecs
+
+NUMPY_OPS = NumpyOps()
+
+def step_forward(model: ParserStepModel, states, is_train):
+    token_ids = model.get_token_ids(states)
+    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
+    mask = None
+    if model.attrs["has_upper"]:
+        dropout_rate = model.attrs["dropout_rate"]
+        if is_train and dropout_rate > 0:
+            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            vector *= mask
+        scores, get_d_vector = model.vec2scores(vector, is_train)
+    else:
+        scores = NumpyOps().asarray(vector)
+        get_d_vector = lambda d_scores: d_scores
+    # If the class is unseen, make sure its score is minimum
+    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
+
+    def backprop_parser_step(d_scores):
+        # Zero vectors for unseen classes
+        d_scores *= model._class_mask
+        d_vector = get_d_vector(d_scores)
+        if mask is not None:
+            d_vector *= mask
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
+        return None
+    return scores, backprop_parser_step
+
+
+cdef class precompute_hiddens:
+    """Allow a model to be "primed" by pre-computing input features in bulk.
+
+    This is used for the parser, where we want to take a batch of documents,
+    and compute vectors for each (token, position) pair. These vectors can then
+    be reused, especially for beam-search.
+
+    Let's say we're using 12 features for each state, e.g. word at start of
+    buffer, three words on stack, their children, etc. In the normal arc-eager
+    system, a document of length N is processed in 2*N states. This means we'll
+    create 2*N*12 feature vectors --- but if we pre-compute, we only need
+    N*12 vector computations. The saving for beam-search is much better:
+    if we have a beam of k, we'll normally make 2*N*12*K computations --
+    so we can save the factor k. This also gives a nice CPU/GPU division:
+    we can do all our hard maths up front, packed into large multiplications,
+    and do the hard-to-program parsing on the CPU.
+    """
+    cdef readonly int nF, nO, nP
+    cdef bint _is_synchronized
+    cdef public object ops
+    cdef public object numpy_ops
+    cdef public object _cpu_ops
+    cdef np.ndarray _features
+    cdef np.ndarray _cached
+    cdef np.ndarray bias
+    cdef object _cuda_stream
+    cdef object _bp_hiddens
+    cdef object activation
+
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 activation="maxout", train=False):
+        gpu_cached, bp_features = lower_model(tokvecs, train)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
+            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
+        else:
+            self.bias = lower_model.get_param("b")
+        self.nF = cached.shape[1]
+        if lower_model.has_dim("nP"):
+            self.nP = lower_model.get_dim("nP")
+        else:
+            self.nP = 1
+        self.nO = cached.shape[2]
+        self.ops = lower_model.ops
+        self.numpy_ops = NumpyOps()
+        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
+        assert activation in (None, "relu", "maxout")
+        self.activation = activation
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    cdef const float* get_feat_weights(self) except NULL:
+        if not self._is_synchronized and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        return <float*>self._cached.data
+
+    def has_dim(self, name):
+        if name == "nF":
+            return self.nF if self.nF is not None else True
+        elif name == "nP":
+            return self.nP if self.nP is not None else True
+        elif name == "nO":
+            return self.nO if self.nO is not None else True
+        else:
+            return False
+
+    def get_dim(self, name):
+        if name == "nF":
+            return self.nF
+        elif name == "nP":
+            return self.nP
+        elif name == "nO":
+            return self.nO
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def set_dim(self, name, value):
+        if name == "nF":
+            self.nF = value
+        elif name == "nP":
+            self.nP = value
+        elif name == "nO":
+            self.nO = value
+        else:
+            raise ValueError(Errors.E1033.format(name=name))
+
+    def __call__(self, X, bint is_train):
+        if is_train:
+            return self.begin_update(X)
+        else:
+            return self.predict(X), lambda X: X
+
+    def predict(self, X):
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids):
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        # This is tricky, but (assuming GPU available);
+        # - Input to forward on CPU
+        # - Output from forward on CPU
+        # - Input to backward on GPU!
+        # - Output from backward on GPU
+        bp_hiddens = self._bp_hiddens
+
+        cdef CBlas cblas = self._cpu_ops.cblas()
+
+        feat_weights = self.get_feat_weights()
+        cdef int[:, ::1] ids = token_ids
+        sum_state_features(cblas, <float*>state_vector.data,
+            feat_weights, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
+
+        def backward(d_state_vector_ids):
+            d_state_vector, token_ids = d_state_vector_ids
+            d_state_vector = bp_nonlinearity(d_state_vector)
+            d_tokens = bp_hiddens((d_state_vector, token_ids))
+            return d_tokens
+        return state_vector, backward
+
+    def _nonlinearity(self, state_vector):
+        if self.activation == "maxout":
+            return self._maxout_nonlinearity(state_vector)
+        else:
+            return self._relu_nonlinearity(state_vector)
+
+    def _maxout_nonlinearity(self, state_vector):
+        state_vector, mask = self.numpy_ops.maxout(state_vector)
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_maxout(d_best):
+            return self.ops.backprop_maxout(d_best, mask, self.nP)
+        
+        return state_vector, backprop_maxout
+
+    def _relu_nonlinearity(self, state_vector):
+        state_vector = state_vector.reshape((state_vector.shape[0], -1))
+        mask = state_vector >= 0.
+        state_vector *= mask
+        # We're outputting to CPU, but we need this variable on GPU for the
+        # backward pass.
+        mask = self.ops.asarray(mask)
+
+        def backprop_relu(d_best):
+            d_best *= mask
+            return d_best.reshape((d_best.shape + (1,)))
+ 
+        return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
deleted file mode 100644
index 965508519e8..00000000000
--- a/spacy/ml/tb_framework.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from libc.stdint cimport int8_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-    int tokens
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const int8_t* seen_mask
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 00000000000..ab4a969e24e
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,50 @@
+from thinc.api import Model, noop
+from .parser_model import ParserStepModel
+from ..util import registry
+
+
+@registry.layers("spacy.TransitionModel.v1")
+def TransitionModel(
+    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
+):
+    """Set up a stepwise transition-based model"""
+    if upper is None:
+        has_upper = False
+        upper = noop()
+    else:
+        has_upper = True
+    # don't define nO for this object, because we can't dynamically change it
+    return Model(
+        name="parser_model",
+        forward=forward,
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
+        layers=[tok2vec, lower, upper],
+        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+        init=init,
+        attrs={
+            "has_upper": has_upper,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def forward(model, X, is_train):
+    step_model = ParserStepModel(
+        X,
+        model.layers,
+        unseen_classes=model.attrs["unseen_classes"],
+        train=is_train,
+        has_upper=model.attrs["has_upper"],
+    )
+
+    return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+    model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower")
+    lower.initialize()
+    if model.attrs["has_upper"]:
+        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
deleted file mode 100644
index e497643f0cd..00000000000
--- a/spacy/ml/tb_framework.pyx
+++ /dev/null
@@ -1,639 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, cast
-
-from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
-import numpy
-
-cimport numpy as np
-
-from thinc.api import (
-    Linear,
-    Model,
-    NumpyOps,
-    chain,
-    glorot_uniform_init,
-    list2array,
-    normal_init,
-    uniform_init,
-    zero_init,
-)
-
-from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-
-from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
-
-from ..errors import Errors
-from ..pipeline._parser_internals import _beam_utils
-from ..pipeline._parser_internals.batch import GreedyBatch
-
-from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
-from ..pipeline._parser_internals.transition_system cimport (
-    TransitionSystem,
-    c_apply_actions,
-    c_transition_batch,
-)
-
-from ..tokens.doc import Doc
-from ..util import registry
-
-State = Any  # TODO
-
-
-@registry.layers("spacy.TransitionModel.v2")
-def TransitionModel(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    beam_width: int = 1,
-    beam_density: float = 0.0,
-    state_tokens: int,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-    unseen_classes=set(),
-) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
-    """Set up a transition-based parsing model, using a maxout hidden
-    layer and a linear output layer.
-    """
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
-    tok2vec_projected.set_dim("nO", hidden_width)
-
-    # FIXME: we use `output` as a container for the output layer's
-    # weights and biases. Thinc optimizers cannot handle resizing
-    # of parameters. So, when the parser model is resized, we
-    # construct a new `output` layer, which has a different key in
-    # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `output` layer by `output_W` and `output_b`
-    # parameters in this model.
-    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
-
-    return Model(
-        name="parser_model",
-        forward=forward,
-        init=init,
-        layers=[tok2vec_projected, output],
-        refs={
-            "tok2vec": tok2vec_projected,
-            "output": output,
-        },
-        params={
-            "hidden_W": None,  # Floats2d W for the hidden layer
-            "hidden_b": None,  # Floats1d bias for the hidden layer
-            "hidden_pad": None,  # Floats1d padding for the hidden layer
-        },
-        dims={
-            "nO": None,  # Output size
-            "nP": maxout_pieces,
-            "nH": hidden_width,
-            "nI": tok2vec_projected.maybe_get_dim("nO"),
-            "nF": state_tokens,
-        },
-        attrs={
-            "beam_width": beam_width,
-            "beam_density": beam_density,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def resize_output(model: Model, new_nO: int) -> Model:
-    old_nO = model.maybe_get_dim("nO")
-    output = model.get_ref("output")
-    if old_nO is None:
-        model.set_dim("nO", new_nO)
-        output.set_dim("nO", new_nO)
-        output.initialize()
-        return model
-    elif new_nO <= old_nO:
-        return model
-    elif output.has_param("W"):
-        nH = model.get_dim("nH")
-        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_output.initialize()
-        new_W = new_output.get_param("W")
-        new_b = new_output.get_param("b")
-        old_W = output.get_param("W")
-        old_b = output.get_param("b")
-        new_W[:old_nO] = old_W  # type: ignore
-        new_b[:old_nO] = old_b  # type: ignore
-        for i in range(old_nO, new_nO):
-            model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_output
-        model.set_ref("output", new_output)
-    # TODO: Avoid this private intrusion
-    model._dims["nO"] = new_nO
-    return model
-
-
-def init(
-    model,
-    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
-    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
-):
-    if X is not None:
-        docs, _ = X
-        model.get_ref("tok2vec").initialize(X=docs)
-    else:
-        model.get_ref("tok2vec").initialize()
-    inferred_nO = _infer_nO(Y)
-    if inferred_nO is not None:
-        current_nO = model.maybe_get_dim("nO")
-        if current_nO is None or current_nO != inferred_nO:
-            model.attrs["resize_output"](model, inferred_nO)
-    nP = model.get_dim("nP")
-    nH = model.get_dim("nH")
-    nI = model.get_dim("nI")
-    nF = model.get_dim("nF")
-    ops = model.ops
-
-    Wl = ops.alloc2f(nH * nP, nF * nI)
-    bl = ops.alloc1f(nH * nP)
-    padl = ops.alloc1f(nI)
-    # Wl = zero_init(ops, Wl.shape)
-    Wl = glorot_uniform_init(ops, Wl.shape)
-    padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize output_W
-    model.set_param("hidden_W", Wl)
-    model.set_param("hidden_b", bl)
-    model.set_param("hidden_pad", padl)
-    # model = _lsuv_init(model)
-    return model
-
-
-class TransitionModelInputs:
-    """
-    Input to transition model.
-    """
-
-    # dataclass annotation is not yet supported in Cython 0.29.x,
-    # so, we'll do something close to it.
-
-    actions: Optional[List[Ints1d]]
-    docs: List[Doc]
-    max_moves: int
-    moves: TransitionSystem
-    states: Optional[List[State]]
-
-    __slots__ = [
-        "actions",
-        "docs",
-        "max_moves",
-        "moves",
-        "states",
-    ]
-
-    def __init__(
-        self,
-        docs: List[Doc],
-        moves: TransitionSystem,
-        actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0,
-        states: Optional[List[State]] = None,
-    ):
-        """
-        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
-        docs (List[Doc]): Docs to predict transition sequences for.
-        max_moves: (int): the maximum number of moves to apply, values less
-            than 1 will apply moves to states until they are final states.
-        moves (TransitionSystem): the transition system to use when predicting
-            the transition sequences.
-        states (Optional[List[States]]): the initial states to predict the
-            transition sequences for. When absent, the initial states are
-            initialized from the provided Docs.
-        """
-        self.actions = actions
-        self.docs = docs
-        self.moves = moves
-        self.max_moves = max_moves
-        self.states = states
-
-
-def forward(model, inputs: TransitionModelInputs, is_train: bool):
-    docs = inputs.docs
-    moves = inputs.moves
-    actions = inputs.actions
-
-    beam_width = model.attrs["beam_width"]
-    hidden_pad = model.get_param("hidden_pad")
-    tok2vec = model.get_ref("tok2vec")
-
-    states = moves.init_batch(docs) if inputs.states is None else inputs.states
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    seen_mask = _get_seen_mask(model)
-
-    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
-        # Note: max_moves is only used during training, so we don't need to
-        #       pass it to the greedy inference path.
-        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
-    else:
-        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
-                                 max_moves=inputs.max_moves)
-
-
-def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
-    cdef vector[StateC*] c_states
-    cdef StateClass state
-    for state in states:
-        if not state.is_final():
-            c_states.push_back(state.c)
-    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
-    # Precomputed features have rows for each token, plus one for padding.
-    cdef int n_tokens = feats.shape[0] - 1
-    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
-    cdef CBlas cblas = model.ops.cblas()
-    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
-
-    def backprop(dY):
-        raise ValueError(Errors.E4004)
-
-    return (states, scores), backprop
-
-
-cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
-                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i
-    cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = _alloc_activations(sizes)
-    cdef np.ndarray step_scores
-    cdef np.ndarray step_actions
-
-    scores = []
-    while sizes.states >= 1:
-        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
-        step_actions = actions[0] if actions is not None else None
-        with nogil:
-            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
-            if actions is None:
-                # Validate actions, argmax, take action.
-                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                                   sizes.states)
-            else:
-                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-        sizes.states = unfinished.size()
-        scores.append(step_scores)
-        unfinished.clear()
-        actions = actions[1:] if actions is not None else None
-    _free_activations(&activations)
-
-    return scores
-
-
-def _forward_fallback(
-    model: Model,
-    moves: TransitionSystem,
-    states: List[StateClass],
-    tokvecs, backprop_tok2vec,
-    feats,
-    backprop_feats,
-    seen_mask,
-    is_train: bool,
-    actions: Optional[List[Ints1d]] = None,
-    max_moves: int = 0,
-):
-    nF = model.get_dim("nF")
-    output = model.get_ref("output")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-
-    beam_width = model.attrs["beam_width"]
-    beam_density = model.attrs["beam_density"]
-
-    ops = model.ops
-
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    if beam_width == 1:
-        batch = GreedyBatch(moves, states, None)
-    else:
-        batch = _beam_utils.BeamBatch(
-            moves, states, None, width=beam_width, density=beam_density
-        )
-    arange = ops.xp.arange(nF)
-    n_moves = 0
-    while not batch.is_done:
-        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
-        for i, state in enumerate(batch.get_unfinished_states()):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += hidden_b
-        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        cpu_scores = ops.to_numpy(scores)
-        if actions is None:
-            batch.advance(cpu_scores)
-        else:
-            batch.advance_with_actions(actions[0])
-            actions = actions[1:]
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_ids.append(ids)
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-        if n_moves >= max_moves >= 1:
-            break
-        n_moves += 1
-
-    def backprop_parser(d_states_d_scores):
-        ids = ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = ops.xp.vstack(all_statevecs)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False  # no-cython-lint
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO) @ (nS, nH).T
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        # We don't need to backprop the summation, because we pass back the IDs instead
-        d_state_features = backprop_feats((d_preacts2f, ids))
-        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (list(batch), all_scores), backprop_parser
-
-
-def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
-    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
-    for class_ in model.attrs.get("unseen_classes", set()):
-        mask[class_] = True
-    return mask
-
-
-def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("hidden_W")
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
-    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
-    W3f = W3f.transpose((1, 0, 2))
-    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
-    assert X.shape == (X.shape[0], nI), X.shape
-    Yf_ = model.ops.gemm(X, W2f, trans2=True)
-    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
-
-    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nH, nP), and get back:
-        # (nB, nH, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nH, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        dXf = model.ops.gemm(dY, W)
-        Xf = X[ids].reshape((ids.shape[0], -1))
-        dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("hidden_W", dW)
-        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
-
-    return Yf, backward
-
-
-def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
-    if Y is None:
-        return None
-    _, scores = Y
-    if len(scores) == 0:
-        return None
-    assert scores[0].shape[0] >= 1
-    assert len(scores[0].shape) == 2
-    return scores[0].shape[1]
-
-
-def _lsuv_init(model: Model):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    W = model.maybe_get_param("hidden_W")
-    if W is not None and W.any():
-        return
-
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nH, nP, nI)
-    b = model.ops.alloc2f(nH, nP)
-    pad = model.ops.alloc4f(1, nF, nH, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc_f((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc_f((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
-        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
-        vectors3f += b
-        return model.ops.maxout(vectors3f)[0]
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = cast(Floats4d, model.get_param("hidden_W").copy())
-    b = cast(Floats2d, model.get_param("hidden_b").copy())
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("hidden_W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("hidden_b", b)
-        else:
-            break
-    return model
-
-
-cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    output = model.get_ref("output")
-    cdef np.ndarray hidden_b = model.get_param("hidden_b")
-    cdef np.ndarray output_W = output.get_param("W")
-    cdef np.ndarray output_b = output.get_param("b")
-
-    cdef WeightsC weights
-    weights.feat_weights = feats
-    weights.feat_bias = <const float*>hidden_b.data
-    weights.hidden_weights = <const float *> output_W.data
-    weights.hidden_bias = <const float *> output_b.data
-    weights.seen_mask = <const int8_t*> seen_mask.data
-
-    return weights
-
-
-cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC sizes
-    sizes.states = batch_size
-    sizes.classes = model.get_dim("nO")
-    sizes.hiddens = model.get_dim("nH")
-    sizes.pieces = model.get_dim("nP")
-    sizes.feats = model.get_dim("nF")
-    sizes.embed_width = model.get_dim("nI")
-    sizes.tokens = tokens
-    return sizes
-
-
-cdef ActivationsC _alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    _resize_activations(&A, n)
-    return A
-
-
-cdef void _free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    _resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    if W.hidden_weights == NULL:
-        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = scores[0]
-    for i in range(1, n.states * n.classes):
-        if scores[i] < min_:
-            min_ = scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if W.seen_mask[j]:
-                scores[i*n.classes+j] = min_
-
-
-cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces  # no-cython-lint
-    cdef int F = n.feats
-    cdef int T = n.tokens
-    padding = cached + (T * F * O)
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
deleted file mode 100644
index 7fee05bad60..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef int arg_max(const float* scores, const int n_classes) nogil
-cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
deleted file mode 100644
index 582756bf5be..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pyx
+++ /dev/null
@@ -1,22 +0,0 @@
-# cython: infer_types=True
-
-cdef inline int arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
-
-
-cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1c61ac271d8..04274ce8af1 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -7,6 +7,8 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -26,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    vector[int] _heads
+    int* _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,34 +36,31 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
-    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil except +:
-        this._heads.resize(length, -1)
-        this._unshiftable.resize(length, False)
-
-        # Reserve memory ahead of time to minimize allocations during parsing.
-        # The initial capacity set here ideally reflects the expected average-case/majority usage.
-        cdef int init_capacity = 32
-        this._stack.reserve(init_capacity)
-        this._rebuffer.reserve(init_capacity)
-        this._ents.reserve(init_capacity)
-        this._left_arcs.reserve(init_capacity)
-        this._right_arcs.reserve(init_capacity)
-        this.history.reserve(init_capacity)
-
+    __init__(const TokenC* sent, int length) nogil:
         this._sent = sent
+        this._heads = <int*>calloc(length, sizeof(int))
+        if not (this._sent and this._heads):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
+        for i in range(length):
+            this._heads[i] = -1
+            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
+    __dealloc__():
+        free(this._heads)
+
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -134,20 +133,19 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        cdef int stack_size = this._stack.size()
-        if i >= stack_size or i < 0:
+        if i >= this._stack.size():
             return -1
-        else:
-            return this._stack[stack_size - (i+1)]
+        elif i < 0:
+            return -1
+        return this._stack.at(this._stack.size() - (i+1))
 
     int B(int i) nogil const:
-        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < buf_size:
-            return this._rebuffer[buf_size - (i+1)]
+        elif i < this._rebuffer.size():
+            return this._rebuffer.at(this._rebuffer.size() - (i+1))
         else:
-            b_i = this._b_i + (i - buf_size)
+            b_i = this._b_i + (i - this._rebuffer.size())
             if b_i >= this.length:
                 return -1
             else:
@@ -246,7 +244,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
+        elif this._sent_starts.count(word) >= 1:
             return 1
         else:
             return 0
@@ -330,7 +328,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable[item]
+            return this._unshiftable.at(item)
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -350,9 +348,6 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        cdef ArcC* arc
-
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -361,12 +356,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = &arcs.back()
+        arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = &deref(arcs)[i]
+                arc = arcs.at(i)
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -406,11 +401,10 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        this._heads = src._heads
+        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
-        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 462aa820e4f..9dda3bd5e44 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,8 +779,6 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -865,7 +863,6 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
deleted file mode 100644
index 60734e549aa..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef class Batch:
-    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
deleted file mode 100644
index 91073b52e68..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any
-
-TransitionSystem = Any  # TODO
-
-cdef class Batch:
-    def advance(self, scores):
-        raise NotImplementedError
-
-    def get_states(self):
-        raise NotImplementedError
-
-    @property
-    def is_done(self):
-        raise NotImplementedError
-
-    def get_unfinished_states(self):
-        raise NotImplementedError
-
-    def __getitem__(self, i):
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-
-class GreedyBatch(Batch):
-    def __init__(self, moves: TransitionSystem, states, golds):
-        self._moves = moves
-        self._states = states
-        self._next_states = [s for s in states if not s.is_final()]
-
-    def advance(self, scores):
-        self._next_states = self._moves.transition_states(self._next_states, scores)
-
-    def advance_with_actions(self, actions):
-        self._next_states = self._moves.apply_actions(self._next_states, actions)
-
-    def get_states(self):
-        return self._states
-
-    @property
-    def is_done(self):
-        return all(s.is_final() for s in self._states)
-
-    def get_unfinished_states(self):
-        return [st for st in self._states if not st.is_final()]
-
-    def __getitem__(self, i):
-        return self._states[i]
-
-    def __len__(self):
-        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index a8c72f238f6..375a9866ed2 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -166,7 +166,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+    
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -316,8 +316,6 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -663,7 +661,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
-
+ 
 
 
 cdef class Out:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index c2a0d22956a..f25408a13ba 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,10 +21,6 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
-    @property
-    def history(self):
-        return list(self.c.history)
-
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -181,6 +177,3 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
-
-    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
-        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 08baed932ba..04cd10d8864 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,10 +57,3 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index ae1cf890f3e..4a0feb435dd 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,8 +3,6 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -76,18 +74,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def follow_history(self, doc, history):
-        cdef int clas
-        cdef StateClass state = StateClass(doc)
-        for clas in history:
-            action = self.c[clas]
-            action.do(state.c, action.label)
-            state.c.history.push_back(clas)
-        return state
-
     def get_oracle_sequence(self, Example example, _debug=False):
-        if not self.has_gold(example):
-            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -99,8 +86,6 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
-        if state.is_final():
-            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -126,7 +111,6 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -154,28 +138,6 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
-        state.c.history.push_back(action.clas)
-
-    def apply_actions(self, states, const int[::1] actions):
-        assert len(states) == actions.shape[0]
-        cdef StateClass state
-        cdef vector[StateC*] c_states
-        c_states.resize(len(states))
-        cdef int i
-        for (i, state) in enumerate(states):
-            c_states[i] = state.c
-        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    def transition_states(self, states, float[:, ::1] scores):
-        assert len(states) == scores.shape[0]
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -288,34 +250,3 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil:
-    cdef int i
-    cdef Transition action
-    cdef StateC* state
-    for i in range(batch_size):
-        state = states[i]
-        action = moves.c[actions[i]]
-        action.do(state, action.label)
-        state.history.push_back(action.clas)
-
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil:
-    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
-    cdef int i, guess
-    cdef Transition action
-    for i in range(batch_size):
-        moves.set_valid(is_valid, states[i])
-        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-        if guess == -1:
-            # This shouldn't happen, but it's hard to raise an error here,
-            # and we don't want to infinite loop. So, force to end state.
-            states[i].force_final()
-        else:
-            action = moves.c[guess]
-            action.do(states[i], action.label)
-            states[i].history.push_back(guess)
-    free(is_valid)
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.pyx
similarity index 96%
rename from spacy/pipeline/dep_parser.py
rename to spacy/pipeline/dep_parser.pyx
index b4961487b83..3e59deaae4f 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.pyx
@@ -4,6 +4,10 @@
 
 from thinc.api import Config, Model
 
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager
+
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
@@ -17,11 +21,12 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -121,7 +126,6 @@ def make_parser(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,7 +231,6 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
-
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,11 +238,8 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
-
     results = {}
-    results.update(
-        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-    )
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -252,12 +252,11 @@ def make_parser_scorer():
     return parser_score
 
 
-class DependencyParser(Parser):
+cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
-
     TransitionSystem = ArcEager
 
     def __init__(
@@ -277,7 +276,8 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser."""
+        """Create a DependencyParser.
+        """
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.pyx
similarity index 93%
rename from spacy/pipeline/ner.py
rename to spacy/pipeline/ner.pyx
index 1c7cf151385..c73ec5c528a 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.pyx
@@ -10,15 +10,22 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..util import registry
+from ..training import remove_bilu_prefix
+
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -43,12 +50,8 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+
 )
 def make_ner(
     nlp: Language,
@@ -101,7 +104,6 @@ def make_ner(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -115,12 +117,7 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
 def make_beam_ner(
     nlp: Language,
@@ -194,12 +191,11 @@ def make_ner_scorer():
     return ner_score
 
 
-class EntityRecognizer(Parser):
+cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -217,14 +213,15 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer."""
+        """Create an EntityRecognizer.
+        """
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,  # not relevant for NER
+            min_action_freq=1,   # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
new file mode 100644
index 00000000000..f20e69a6e56
--- /dev/null
+++ b/spacy/pipeline/transition_parser.pxd
@@ -0,0 +1,21 @@
+from cymem.cymem cimport Pool
+from thinc.backends.cblas cimport CBlas
+
+from ..vocab cimport Vocab
+from .trainable_pipe cimport TrainablePipe
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals._state cimport StateC
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+
+
+cdef class Parser(TrainablePipe):
+    cdef public object _rehearsal_model
+    cdef readonly TransitionSystem moves
+    cdef public object _multitasks
+    cdef object _cpu_ops
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index bb9b7653ce3..5d5fbb02790 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,7 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-def test_overfitting_IO():
-    fix_random_seed(1)
+@pytest.mark.parametrize("use_upper", [True, False])
+def test_overfitting_IO(use_upper):
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {}})
+    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,6 +577,7 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
+        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index d25eb165acb..42cf5ced998 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-import itertools
-
-import numpy
 import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
@@ -62,8 +59,6 @@
     ),
 ]
 
-PARSERS = ["parser"]  # TODO: Test beam_parser when ready
-
 eps = 0.1
 
 
@@ -176,57 +171,6 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
-def test_parser_apply_actions(en_vocab, en_parser):
-    words = ["I", "ate", "pizza"]
-    words2 = ["Eat", "more", "pizza", "!"]
-    doc1 = Doc(en_vocab, words=words)
-    doc2 = Doc(en_vocab, words=words2)
-    docs = [doc1, doc2]
-
-    moves = en_parser.moves
-    moves.add_action(0, "")
-    moves.add_action(1, "")
-    moves.add_action(2, "nsubj")
-    moves.add_action(3, "obj")
-    moves.add_action(2, "amod")
-
-    actions = [
-        numpy.array([0, 0], dtype="i"),
-        numpy.array([2, 0], dtype="i"),
-        numpy.array([0, 4], dtype="i"),
-        numpy.array([3, 3], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([0], dtype="i"),
-        numpy.array([1], dtype="i"),
-    ]
-
-    states = moves.init_batch(docs)
-    active_states = states
-
-    for step_actions in actions:
-        active_states = moves.apply_actions(active_states, step_actions)
-
-    assert len(active_states) == 0
-
-    for (state, doc) in zip(states, docs):
-        moves.set_annotations(state, doc)
-
-    assert docs[0][0].head.i == 1
-    assert docs[0][0].dep_ == "nsubj"
-    assert docs[0][1].head.i == 1
-    assert docs[0][1].dep_ == "ROOT"
-    assert docs[0][2].head.i == 1
-    assert docs[0][2].dep_ == "obj"
-
-    assert docs[1][0].head.i == 0
-    assert docs[1][0].dep_ == "ROOT"
-    assert docs[1][1].head.i == 2
-    assert docs[1][1].dep_ == "amod"
-    assert docs[1][2].head.i == 0
-    assert docs[1][2].dep_ == "obj"
-
-
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -375,7 +319,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", PARSERS)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -401,15 +345,11 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize(
-    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
-)
-def test_overfitting_IO(pipe_name, max_moves):
-    fix_random_seed(0)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
-    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -511,12 +451,10 @@ def test_distill():
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TODO: re-enable after we have a spacy-legacy release for v4. See
-        # https://github.com/explosion/spacy-legacy/pull/36
-        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V1
+        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V2
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
-        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f6cefbc1f84..c3c4bb6c686 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v3"
+    @architectures = "spacy.TransitionBasedParser.v2"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..8a1c74ca9ed 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,11 +189,33 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
+use_upper = true
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+parser_config_string_no_upper = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 66
+maxout_pieces = 2
+use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -224,6 +246,7 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
+        use_upper=True,
     )
     return parser
 
@@ -337,16 +360,15 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("W")
-        assert output.has_param("b")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        assert model.get_ref("upper").get_dim("nI") == 65
+        assert model.get_ref("lower").get_dim("nI") == 65
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -359,13 +381,11 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("b")
-        assert output.has_param("W")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        if model.attrs["has_upper"]:
+            assert model.get_ref("upper").get_dim("nI") == 66
+        assert model.get_ref("lower").get_dim("nI") == 66
 
 
 def test_config_nlp_roundtrip():
@@ -561,7 +581,9 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c05ef625e11..44717f6eba2 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,19 +1,22 @@
 import ctypes
 import os
 from pathlib import Path
-
 import pytest
-from pydantic import ValidationError
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+
+try:
+    from pydantic.v1 import ValidationError
+except ImportError:
+    from pydantic import ValidationError  # type: ignore
+
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int, find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -96,6 +99,34 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
+    assert model.get_param("W").shape == (nF, nO, nP, nI)
+    tensor = model.ops.alloc((10, nI))
+    Y, get_dX = model.begin_update(tensor)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
+    dY = model.ops.alloc((15, nO, nP))
+    ids = model.ops.alloc((15, nF))
+    ids[1, 2] = -1
+    dY[1] = 1
+    assert not model.has_grad("pad")
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 2, 0, 0] == 1.0
+    ids.fill(0.0)
+    dY.fill(0.0)
+    dY[0] = 0
+    ids[1, 2] = 0
+    ids[1, 1] = -1
+    ids[1, 0] = -1
+    dY[1] = 1
+    ids[2, 0] = -1
+    dY[2] = 5
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 0, 0, 0] == 6
+    assert d_pad[0, 1, 0, 0] == 1
+    assert d_pad[0, 2, 0, 0] == 0
+
+
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index d121c9aa56f..89a6a10b6cb 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,5 +1,5 @@
 from collections.abc import Iterable as IterableInstance
-
+import warnings
 import numpy
 
 from murmurhash.mrmr cimport hash64
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index db8f974ea19..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,17 +833,18 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v3"
+> @architectures = "spacy.TransitionBasedParser.v2"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
+> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -873,22 +874,23 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                       |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
- </Accordion>
+</Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 9fddbbd01db..edf6892a1d5 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v3
+Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
@@ -741,7 +741,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
@@ -764,7 +764,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 44c80622437..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 1b0bc9606e9..2bd2856b6a3 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v1"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From 8cca0dd4bafdb18ca09153d892408cae77732e65 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:24:09 +0100
Subject: [PATCH 401/504] isort

---
 spacy/ml/models/parser.py                   |  9 +++++----
 spacy/ml/parser_model.pxd                   |  5 +++--
 spacy/ml/parser_model.pyx                   |  7 ++++---
 spacy/ml/tb_framework.py                    |  3 ++-
 spacy/pipeline/_parser_internals/_state.pxd |  3 +--
 spacy/pipeline/dep_parser.pyx               |  3 ++-
 spacy/pipeline/ner.pyx                      |  9 +++++----
 spacy/pipeline/transition_parser.pxd        |  6 +++---
 spacy/tests/parser/test_ner.py              |  1 -
 spacy/tests/test_misc.py                    | 20 +++++++++++---------
 10 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea8f..f6c0e565dd3 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...errors import Errors
 from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-from ...tokens import Doc
 
 
 @registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea53f..ca31c169964 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
 from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
 
 
 cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 91558683b60..90e836f8a0a 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,18 +1,19 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
 cimport numpy as np
 from libc.math cimport exp
-from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from thinc.backends.cblas cimport saxpy, sgemm
 
 import numpy
 import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
 
 from .. import util
 from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
 from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e24e..e351ad4e570 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
 from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
 from ..util import registry
+from .parser_model import ParserStepModel
 
 
 @registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 04274ce8af1..c063cf97cd4 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,4 +1,5 @@
 cimport libcpp
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -7,8 +8,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3e59deaae4f..1fdc55dab41 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
 
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c73ec5c528a..29f22b0174d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -10,13 +10,14 @@ from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+
 from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..util import registry
+from ..scorer import PRFScore, get_ner_prf
 from ..training import remove_bilu_prefix
-
+from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index f20e69a6e56..a48d76b6819 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -1,11 +1,11 @@
 from cymem.cymem cimport Pool
 from thinc.backends.cblas cimport CBlas
 
+from ..ml.parser_model cimport ActivationsC, SizesC, WeightsC
 from ..vocab cimport Vocab
-from .trainable_pipe cimport TrainablePipe
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
 from ._parser_internals._state cimport StateC
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from .trainable_pipe cimport TrainablePipe
 
 
 cdef class Parser(TrainablePipe):
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 5d5fbb02790..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,7 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 44717f6eba2..d2a41ff0fed 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 from pathlib import Path
+
 import pytest
 
 try:
@@ -8,15 +9,16 @@
 except ImportError:
     from pydantic import ValidationError  # type: ignore
 
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util

From 31b2f7d3ff435a8d5b8b4bb1c935381578b24f45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:38:01 +0100
Subject: [PATCH 402/504] Add distillation tests with max cut size

And fix endless loop when the max cut size is 0 or 1.
---
 spacy/tests/parser/test_ner.py   | 5 ++++-
 spacy/tests/parser/test_parse.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..7c3a9d56249 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 42cf5ced998..dbede7edd52 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -402,7 +402,9 @@ def test_is_distillable():
     assert parser.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []
@@ -420,6 +422,7 @@ def test_distill():
 
     student = English()
     student_parser = student.add_pipe("parser")
+    student_parser.cfg["update_with_oracle_cut_size"] = max_moves
     student_parser.initialize(
         get_examples=lambda: train_examples, labels=teacher_parser.label_data
     )

From 70f337ceee872915e84643ed08e38e64c5d07e8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:02:15 +0100
Subject: [PATCH 403/504] Fix Cython lints

---
 spacy/ml/parser_model.pxd                |  9 ++--
 spacy/ml/parser_model.pyx                | 64 ++++++++++++------------
 spacy/pipeline/_parser_internals/ner.pyx |  4 +-
 spacy/pipeline/dep_parser.pyx            |  1 +
 spacy/pipeline/ner.pyx                   |  3 +-
 spacy/pipeline/transition_parser.pxd     |  4 +-
 6 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index ca31c169964..88386255147 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -41,10 +41,9 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
 cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
- 
+                         const WeightsC* W, SizesC n) nogil
+
 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
- 
+cdef void cpu_log_loss(float* d_scores, const float* costs,
+                       const int* is_valid, const float* scores, int O) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 90e836f8a0a..843275f4c8b 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -13,7 +13,7 @@ from .. import util
 from ..errors import Errors
 
 from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
@@ -78,31 +78,31 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
+                                   n.states * n.classes * sizeof(A.scores[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
+                         const WeightsC* W, SizesC n) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
     memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
+                       n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
+                     &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = _arg_max(&A.unmaxed[index], n.pieces)
@@ -112,10 +112,10 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
         memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
     else:
         # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-            1.0, <const float *>A.hiddens, n.hiddens,
-            <const float *>W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes)
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
+                     <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens, 0.0,
+                     A.scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
@@ -131,9 +131,9 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
                 A.scores[i*n.classes+j] = min_
 
 
-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
+cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
+                             const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     padding = cached
     cached += F * O
@@ -150,9 +150,8 @@ cdef void sum_state_features(CBlas cblas, float* output,
         token_ids += F
 
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
+cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
+                       const float* scores, int O) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -178,7 +177,7 @@ cdef void cpu_log_loss(float* d_scores,
 
 
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
+                         const int* is_valid, int n) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -202,10 +201,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
     return best
 
 
-
 class ParserStepModel(Model):
     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
+                 dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
         self.attrs["dropout_rate"] = dropout
@@ -267,7 +265,7 @@ class ParserStepModel(Model):
 
     def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
         if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+           and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
             # Move token_ids and d_vector to GPU, asynchronously
             self.backprops.append((
                 util.get_async(self.cuda_stream, token_ids),
@@ -277,7 +275,6 @@ class ParserStepModel(Model):
         else:
             self.backprops.append((token_ids, d_vector, get_d_tokvecs))
 
-
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -290,14 +287,15 @@ class ParserStepModel(Model):
             ids = ids.flatten()
             d_state_features = d_state_features.reshape(
                 (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
+            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
         # Padded -- see update()
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs
 
+
 NUMPY_OPS = NumpyOps()
 
+
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
@@ -310,7 +308,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
+        def get_d_vector(d_scores): return d_scores
     # If the class is unseen, make sure its score is minimum
     scores[:, model._class_mask == 0] = numpy.nanmin(scores)
 
@@ -445,8 +443,8 @@ cdef class precompute_hiddens:
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(cblas, <float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0], token_ids.shape[0],
+                           self.nF, self.nO*self.nP)
         state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
@@ -471,7 +469,7 @@ cdef class precompute_hiddens:
 
         def backprop_maxout(d_best):
             return self.ops.backprop_maxout(d_best, mask, self.nP)
-        
+
         return state_vector, backprop_maxout
 
     def _relu_nonlinearity(self, state_vector):
@@ -485,7 +483,7 @@ cdef class precompute_hiddens:
         def backprop_relu(d_best):
             d_best *= mask
             return d_best.reshape((d_best.shape + (1,)))
- 
+
         return state_vector, backprop_relu
 
 cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 375a9866ed2..324a497c9fb 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -166,7 +166,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-    
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -661,7 +661,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
- 
+
 
 
 cdef class Out:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fdc55dab41..cbd7187ff0f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 29f22b0174d..fe54d33a17b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index a48d76b6819..7adb82213de 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser(TrainablePipe):
     cdef object _cpu_ops
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
+                      WeightsC weights, SizesC sizes) nogil
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
+                                 int nr_class, int batch_size) nogil

From 53e9d8c4b456cdcd7116ebade2204d1d74513523 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:17:24 +0100
Subject: [PATCH 404/504] Bring back W401

---
 spacy/errors.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index bf8804e4f36..c005265ff8d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -217,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
     W126 = ("These keys are unsupported: {unsupported}")
     W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
+    # v4 warning strings
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

From 724593491b10246e6aa914019656dfc3ab90da4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 19 Dec 2023 09:28:20 +0100
Subject: [PATCH 405/504] Fix `TransitionBasedParser` version in transformer
 embeddings docs

---
 website/docs/usage/embeddings-transformers.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 2bd2856b6a3..534cf478087 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -140,7 +140,7 @@ factory = "tok2vec"
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
@@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 
 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component.
 factory = "ner"
 
 [nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 128

From ec8b5a52125a2707b2ca941e84b2bfdbc8b21afc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 09:47:38 +0100
Subject: [PATCH 406/504] No need for `Literal` compat, since we only support
 >= 3.8

---
 spacy/compat.py           | 5 -----
 spacy/errors.py           | 1 -
 spacy/ml/models/parser.py | 3 +--
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 30459e2e495..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,11 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/errors.py b/spacy/errors.py
index c005265ff8d..eab1d90c0f9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1008,7 +1008,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
-
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index f6c0e565dd3..e776174f6ed 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,9 +1,8 @@
-from typing import List, Optional, cast
+from typing import List, Literal, Optional
 
 from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d
 
-from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry

From 07c18904d9fe2008124ccd03924d650e4b63005b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 10:06:28 +0100
Subject: [PATCH 407/504] Fix parser distillation test seed

The test would sometimes fail. Rather than increasing test by increasing
training iterations, use a known-good seed.
---
 spacy/tests/parser/test_parse.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index dbede7edd52..f63d56f6922 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -405,6 +405,7 @@ def test_is_distillable():
 @pytest.mark.slow
 @pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
 def test_distill(max_moves):
+    fix_random_seed(0)
     teacher = English()
     teacher_parser = teacher.add_pipe("parser")
     train_examples = []

From adcb060c7b0f33faa0ee14e87784e3bc5daa9c34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:14:35 +0100
Subject: [PATCH 408/504] TransitionBasedParser.v2 in run example output

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/docs/api/cli.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index edf6892a1d5..8f04e18c376 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -406,7 +406,7 @@ Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
@@ -416,7 +416,7 @@ Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
-Name       spacy.TransitionBasedParser.v1
+Name       spacy.TransitionBasedParser.v2
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]

From d6dcab9c4bb444c367d189cd688e3bba3c0f657c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 16 Jan 2024 14:54:26 +0100
Subject: [PATCH 409/504] Update thinc dependency to 9.0.0.dev4

---
 pyproject.toml                         |  2 +-
 requirements.txt                       |  2 +-
 setup.cfg                              | 10 +++++++++-
 spacy/pipeline/edit_tree_lemmatizer.py |  7 ++-----
 spacy/pipeline/morphologizer.pyx       |  3 ++-
 spacy/pipeline/senter.pyx              |  7 ++-----
 spacy/pipeline/tagger.pyx              |  9 +++++++--
 7 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3891d137867..0a5bc162773 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=9.0.0.dev4,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 29420430aab..e82f28055bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev4,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 3a84f37d3bf..223c63dd6da 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev4,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -37,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev2,<9.1.0
+    thinc>=9.0.0.dev4,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index a93a6c676c2..54c880a7d89 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -5,7 +5,6 @@
 import numpy as np
 import srsly
 from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
@@ -134,9 +133,7 @@ def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         validate_examples(examples, "EditTreeLemmatizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(
-            normalize=False, missing_value=-1
-        )
+        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
 
         truths = []
         for eg in examples:
@@ -172,7 +169,7 @@ def get_teacher_student_loss(
 
         DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 5edd922019d..443b6818dc2 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -321,7 +321,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 646166c329c..35627bbf2ad 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -2,10 +2,7 @@
 from itertools import islice
 from typing import Callable, Dict, Iterable, List, Optional, Union
 
-import srsly
-from thinc.api import Config, Model
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
 from ..tokens.doc cimport Doc
 
@@ -183,7 +180,7 @@ class SentenceRecognizer(Tagger):
         """
         validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
-        loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
         for eg in examples:
             eg_truth = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index a73461ee74a..ccd401b6af9 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -304,7 +304,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
@@ -321,7 +321,12 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
+        loss_func = SequenceCategoricalCrossentropy(
+            names=self.labels,
+            normalize=False,
+            neg_prefix=self.cfg["neg_prefix"],
+            label_smoothing=self.cfg["label_smoothing"]
+        )
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.

From e1549e338fc5911bfae8a1d63a8c33639bbd2d35 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 10 Nov 2023 08:05:07 +0100
Subject: [PATCH 410/504] Warn about reloading dependencies after downloading
 models (#13081)

* Update the "Missing factory" error message

This accounts for model installations that took place during the current Python session.

* Add a note about Jupyter notebooks

* Move error to `spacy.cli.download`
Add extra message for Jupyter sessions

* Add additional note for interactive sessions

* Remove note about `spacy-transformers` from error message

* `isort`

* Improve checks for colab (also helps displacy)

* Update warning messages

* Improve flow for multiple checks

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/download.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0635522930b..5e460717cc4 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,10 +7,11 @@
 from wasabi import msg
 
 from .. import about
+from ..errors import OLD_MODEL_SHORTCUTS
 from ..util import (
-    get_installed_models,
     get_minor_version,
-    get_package_version,
+    is_in_interactive,
+    is_in_jupyter,
     is_package,
     is_prerelease_version,
     run_command,

From ff31022cf55c889eb2fc0c7de95df20321f3fd60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 29 Nov 2023 09:11:54 +0100
Subject: [PATCH 411/504] Update `TextCatBOW` to use the fixed `SparseLinear`
 layer (#13149)

* Update `TextCatBOW` to use the fixed `SparseLinear` layer

A while ago, we fixed the `SparseLinear` layer to use all available
parameters: https://github.com/explosion/thinc/pull/754

This change updates `TextCatBOW` to `v3` which uses the new
`SparseLinear_v2` layer. This results in a sizeable improvement on a
text categorization task that was tested.

While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent`
option to make it possible to change the hidden size. Ideally, we'd just
have an option called `length`. But the way that `TextCatBOW` uses
hashes results in a non-uniform distribution of parameters when the
length is not a power of two.

* Replace TexCatBOW `length_exponent` parameter by `length`

We now round up the length to the next power of two if it isn't
a power of two.

* Remove some tests for TextCatBOW.v2

* Fix missing import
---
 spacy/errors.py                      |  3 ---
 spacy/tests/pipeline/test_textcat.py |  8 +++---
 website/docs/api/architectures.mdx   | 40 ++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index eab1d90c0f9..302243b6ca5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -976,9 +976,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
-    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
-             "reduction. Please enable one of `use_reduce_first`, "
-             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index f834597fafe..e26f3b01336 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -502,9 +502,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # REDUCE
-        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        # CNN
+        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
     ],
 )
 # fmt: on
@@ -752,7 +752,7 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2 (legacy)
+        # CNN V2
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # PARAMETRIC ATTENTION V1
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..31beb15644c 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,6 +1020,46 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
+<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+</Accordion>
+
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}
+
 > #### Example Config
 >
 > ```ini

From e56232fd00897a09d66855327b6faa0285b7f4e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 11:00:06 +0100
Subject: [PATCH 412/504] Add TextCatReduce.v1 (#13181)

* Add TextCatReduce.v1

This is a textcat classifier that pools the vectors generated by a
tok2vec implementation and then applies a classifier to the pooled
representation. Three reductions are supported for pooling: first, max,
and mean. When multiple reductions are enabled, the reductions are
concatenated before providing them to the classification layer.

This model is a generalization of the TextCatCNN model, which only
supports mean reductions and is a bit of a misnomer, because it can also
be used with transformers. This change also reimplements TextCatCNN.v2
using the new TextCatReduce.v1 layer.

* Doc fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fully specify `TextCatCNN` <-> `TextCatReduce` equivalence

* Move TextCatCNN docs to legacy, in prep for moving to spacy-legacy

* Add back a test for TextCatCNN.v2

* Replace TextCatCNN in pipe configurations and templates

* Add an infobox to the `TextCatReduce` section with an `TextCatCNN` anchor

* Add last reduction (`use_reduce_last`)

* Remove non-working TextCatCNN Netlify redirect

* Revert layer changes for the quickstart

* Revert one more quickstart change

* Remove unused import

* Fix docstring

* Fix setting name in error message

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                      |  3 ++
 spacy/tests/pipeline/test_textcat.py | 11 ++--
 website/docs/api/architectures.mdx   | 78 ----------------------------
 3 files changed, 7 insertions(+), 85 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 302243b6ca5..eab1d90c0f9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -976,6 +976,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index e26f3b01336..ac32b1c10bf 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -502,9 +502,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -752,12 +752,9 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2
+        # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
-        # PARAMETRIC ATTENTION V1
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 31beb15644c..63f723a28cf 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1020,46 +1020,6 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v3 {id="TextCatBOW"}
-
 > #### Example Config
 >
 > ```ini
@@ -1096,44 +1056,6 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
-### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatParametricAttention.v1"
-> exclusive_classes = true
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.Tok2Vec.v2"
->
-> [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v2"
-> width = 64
-> rows = [2000, 2000, 1000, 1000, 1000, 1000]
-> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
-> include_static_vectors = false
->
-> [model.tok2vec.encode]
-> @architectures = "spacy.MaxoutWindowEncoder.v2"
-> width = ${model.tok2vec.embed.width}
-> window_size = 1
-> maxout_pieces = 3
-> depth = 2
-> ```
-
-A neural network model that is built upon Tok2Vec and uses parametric attention
-to attend to tokens that are relevant to text classification.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From d8386a10b133d5129bf28822b8320e2175f29445 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 2 Jan 2024 10:03:06 +0100
Subject: [PATCH 413/504] Add spacy.TextCatParametricAttention.v1 (#13201)

* Add spacy.TextCatParametricAttention.v1

This layer provides is a simplification of the ensemble classifier that
only uses paramteric attention. We have found empirically that with a
sufficient amount of training data, using the ensemble classifier with
BoW does not provide significant improvement in classifier accuracy.
However, plugging in a BoW classifier does reduce GPU training and
inference performance substantially, since it uses a GPU-only kernel.

* Fix merge fallout
---
 pyproject.toml                       |  5 ++--
 requirements.txt                     |  2 +-
 setup.cfg                            |  4 +--
 spacy/tests/pipeline/test_textcat.py |  3 +++
 website/docs/api/architectures.mdx   | 38 ++++++++++++++++++++++++++++
 5 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0a5bc162773..bfd7e68d1f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev4,<9.1.0",
-    "numpy>=1.15.0",
+    "thinc>=8.2.2,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index e82f28055bb..e99ebc90ab2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev4,<9.1.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 223c63dd6da..b57fdc52bf9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -45,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev4,<9.1.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index ac32b1c10bf..f834597fafe 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -755,6 +755,9 @@ def test_overfitting_IO_multi():
         # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 63f723a28cf..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1056,6 +1056,44 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
+### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatParametricAttention.v1"
+> exclusive_classes = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v2"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v2"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v2"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
+> ```
+
+A neural network model that is built upon Tok2Vec and uses parametric attention
+to attend to tokens that are relevant to text classification.
+
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config

From 6920ea947962ff31034266390d7feb62f99ca8b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 10:28:46 +0100
Subject: [PATCH 414/504] Typing fixes

---
 requirements.txt           | 2 +-
 spacy/tokens/span.pyi      | 2 ++
 spacy/training/example.pyx | 6 ++++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e99ebc90ab2..bee5535257f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 3b93ffdaa0b..5039b33eee2 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -147,6 +147,8 @@ class Span:
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
+    @label_.setter
+    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 89a6a10b6cb..0b346b805a6 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -57,6 +57,12 @@ def validate_examples(examples, method):
 
 
 def validate_distillation_examples(examples, method):
+    """Check that a batch of examples received during processing is valid
+    for distillation.
+
+    examples (Iterable[Examples]): A batch of examples.
+    method (str): The method name to show in error messages.
+    """
     validate_examples(examples, method)
     for eg in examples:
         if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:

From e23dfd757ecbd2362c23a5ebd0530152c1b60054 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 12:20:09 +0100
Subject: [PATCH 415/504] Py_UNICODE is not compatible with 3.12

---
 spacy/pipeline/_parser_internals/search.pyx |  2 +-
 spacy/tests/parser/_search.pyx              | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
index 578299b56ae..52d5cdaa891 100644
--- a/spacy/pipeline/_parser_internals/search.pyx
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+# cython: experimental_cpp_class_def=True, cdivision=True, infer_types=True
 cimport cython
 from cymem.cymem cimport Pool
 from libc.math cimport exp
diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx
index cd9e6b2f5ee..ca2a2916094 100644
--- a/spacy/tests/parser/_search.pyx
+++ b/spacy/tests/parser/_search.pyx
@@ -12,7 +12,7 @@ from ..conftest import cytest
 cdef struct TestState:
     int length
     int x
-    Py_UNICODE* string
+    char *string
 
 
 cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
@@ -22,7 +22,7 @@ cdef int transition(void* dest, void* src, class_t clas, void* extra_args) excep
     dest_state.x = src_state.x
     dest_state.x += clas
     if extra_args != NULL:
-        dest_state.string = <Py_UNICODE*>extra_args
+        dest_state.string = <char *>extra_args
     else:
         dest_state.string = src_state.string
 
@@ -32,9 +32,9 @@ cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
     state.length = n
     state.x = 1
     if extra_args == NULL:
-        state.string = u'default'
+        state.string = 'default'
     else:
-        state.string = <Py_UNICODE*>extra_args
+        state.string = <char *>extra_args
     return state
 
 
@@ -77,7 +77,7 @@ def test_initialize(nr_class, beam_width, length):
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length, s.length
-        assert s.string == 'default'
+        assert s.string.decode('utf8') == 'default'
 
 
 @cytest
@@ -88,11 +88,12 @@ def test_initialize(nr_class, beam_width, length):
                          ]
                          )
 def test_initialize_extra(nr_class, beam_width, length, extra):
+    extra = extra.encode("utf-8") if extra is not None else None
     b = Beam(nr_class, beam_width)
     if extra is None:
         b.initialize(initialize, destroy, length, NULL)
     else:
-        b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
+        b.initialize(initialize, destroy, length, <void*><char*>extra)
     for i in range(b.width):
         s = <TestState*>b.at(i)
         assert s.length == length

From f2beaba25e936e921b5ef60265eed956cd748f80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 15:02:02 +0100
Subject: [PATCH 416/504] Remove `setup_requires` from `setup.cfg`

---
 setup.cfg | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index b57fdc52bf9..8dcaf79d278 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,14 +30,6 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
-setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0

From e29bc6922f0012dd059feadf438aef6e3a15156a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 12:54:23 +0100
Subject: [PATCH 417/504] Set version to v4.0.0.dev2 (#13269)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 73f201af5fb..ef80718fee0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev1"
+__version__ = "4.0.0.dev2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 117e455c92bb1db3faccefdf2de8a45a4aa2281e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 25 Jan 2024 18:24:22 +0100
Subject: [PATCH 418/504] Update `spacy-legacy` dependency to 4.0.0.dev1
 (#13270)

This release is compatible with the parser refactor backout.
---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index bee5535257f..4b58e75506d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev0,<4.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index 8dcaf79d278..55e6942622d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,7 +32,7 @@ include_package_data = true
 python_requires = >=3.8
 install_requires =
     # Our libraries
-    spacy-legacy>=4.0.0.dev0,<4.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0

From 9628790dcf13e0fe6e768ae499c0c46015b89626 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 29 Apr 2024 10:18:07 +0200
Subject: [PATCH 419/504] Fix CI (#13469)

* Remove hardcoded architecture setting

* update classifiers to include Python 3.12
---
 .github/workflows/tests.yml               | 63 ++++++++++++-----------
 .github/workflows/universe_validation.yml |  3 +-
 setup.cfg                                 |  1 +
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 760a79f2121..2a236b6bd3e 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,8 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
-          architecture: x64
+          python-version: "3.7"
 
       - name: black
         run: |
@@ -60,9 +59,11 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
+          - os: windows-latest
+            python_version: "3.7"
           - os: macos-latest
             python_version: "3.8"
-          - os: ubuntu-20.04
+          - os: ubuntu-latest
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
@@ -79,7 +80,6 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          architecture: x64
 
       - name: Install dependencies
         run: |
@@ -93,6 +93,7 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
+        if: matrix.python_version != '3.7'
 
       - name: Delete source directory and .egg-info
         run: |
@@ -114,22 +115,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
 
-      #      - name: "Test download CLI"
-      #        run: |
-      #          python -m spacy download ca_core_news_sm
-      #          python -m spacy download ca_core_news_md
-      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-      #        if: matrix.python_version == '3.9'
-      #
-      #      - name: "Test download_url in info CLI"
-      #        run: |
-      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-      #        if: matrix.python_version == '3.9'
-      #
-      #      - name: "Test no warnings on load (#11713)"
-      #        run: |
-      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-      #        if: matrix.python_version == '3.9'
+      - name: "Test download CLI"
+        run: |
+          python -m spacy download ca_core_news_sm
+          python -m spacy download ca_core_news_md
+          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+        if: matrix.python_version == '3.9'
+
+      - name: "Test download_url in info CLI"
+        run: |
+          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+        if: matrix.python_version == '3.9'
+
+      - name: "Test no warnings on load (#11713)"
+        run: |
+          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |
@@ -153,17 +154,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      #      - name: "Test assemble CLI"
-      #        run: |
-      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-      #        if: matrix.python_version == '3.9'
-      #
-      #      - name: "Test assemble CLI vectors warning"
-      #        run: |
-      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-      #        if: matrix.python_version == '3.9'
+      - name: "Test assemble CLI"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+        if: matrix.python_version == '3.9'
+
+      - name: "Test assemble CLI vectors warning"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml
index c5e68784e00..4d492500c57 100644
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@@ -25,8 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"
-          architecture: x64
+          python-version: "3.7"
 
       - name: Validate website/meta/universe.json
         run: |
diff --git a/setup.cfg b/setup.cfg
index 55e6942622d..a9459c84da7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,6 +21,7 @@ classifiers =
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases

From 8c75003d7d67c32d3a046f9cfc1d8cb6341245c0 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 29 Apr 2024 10:36:31 +0200
Subject: [PATCH 420/504] Bump to v5 (#13470)

---
 .github/workflows/lock.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml
index 6c3985a930a..2bbdd64c771 100644
--- a/.github/workflows/lock.yml
+++ b/.github/workflows/lock.yml
@@ -16,7 +16,7 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
         with:
           process-only: 'issues'
           issue-inactive-days: '30'

From bce9b0244311f7e62ae856ff54d71ea08c9bfe3d Mon Sep 17 00:00:00 2001
From: Alex Strick van Linschoten <strickvl@users.noreply.github.com>
Date: Mon, 29 Apr 2024 11:10:17 +0200
Subject: [PATCH 421/504] Fix typos in docs (#13466)

* fix typos

* prettier formatting

---------

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/find_threshold.py                 |   4 +-
 spacy/tests/test_language.py                |   2 +-
 website/docs/api/attributes.mdx             |  60 ++--
 website/docs/api/cli.mdx                    |   4 +-
 website/docs/api/entitylinker.mdx           |  65 ++--
 website/docs/api/entityruler.mdx            | 315 ++++++++++++++++----
 website/docs/api/span.mdx                   |   2 +-
 website/docs/api/transformer.mdx            |   2 +-
 website/docs/usage/layers-architectures.mdx |   2 +-
 website/docs/usage/linguistic-features.mdx  |   2 +-
 website/docs/usage/projects.mdx             |   4 +-
 website/docs/usage/saving-loading.mdx       |  11 +-
 website/docs/usage/v2-2.mdx                 |   2 +-
 website/docs/usage/v3-2.mdx                 |   2 +-
 14 files changed, 349 insertions(+), 128 deletions(-)

diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 48077fa511d..3e86495e7c1 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -39,7 +39,7 @@ def find_threshold_cli(
     # fmt: on
 ):
     """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
     the specified metric. The search space for the threshold is traversed linearly
     from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
     (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@@ -81,7 +81,7 @@ def find_threshold(
     silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
     """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
     model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 25352d2bb16..6ed0f44eab9 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -424,7 +424,7 @@ def test_language_pipe_error_handler(n_process):
         nlp.set_error_handler(raise_error)
         with pytest.raises(ValueError):
             list(nlp.pipe(texts, n_process=n_process))
-        # set explicitely to ignoring
+        # set explicitly to ignoring
         nlp.set_error_handler(ignore_error)
         docs = list(nlp.pipe(texts, n_process=n_process))
         assert len(docs) == 0
diff --git a/website/docs/api/attributes.mdx b/website/docs/api/attributes.mdx
index 3142b741d9a..9cb76ac5842 100644
--- a/website/docs/api/attributes.mdx
+++ b/website/docs/api/attributes.mdx
@@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed
 as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
 appending `_` as in `token.dep_`.
 
-| Attribute    | Description                                                                                                                                                   |
-| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `DEP`        | The token's dependency label. ~~str~~                                                                                                                         |
-| `ENT_ID`     | The token's entity ID (`ent_id`). ~~str~~                                                                                                                     |
-| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
-| `ENT_KB_ID`  | The token's entity knowledge base ID. ~~str~~                                                                                                                 |
-| `ENT_TYPE`   | The token's entity label. ~~str~~                                                                                                                             |
-| `IS_ALPHA`   | Token text consists of alphabetic characters. ~~bool~~                                                                                                        |
-| `IS_ASCII`   | Token text consists of ASCII characters. ~~bool~~                                                                                                             |
-| `IS_DIGIT`   | Token text consists of digits. ~~bool~~                                                                                                                       |
-| `IS_LOWER`   | Token text is in lowercase. ~~bool~~                                                                                                                          |
-| `IS_PUNCT`   | Token is punctuation. ~~bool~~                                                                                                                                |
-| `IS_SPACE`   | Token is whitespace. ~~bool~~                                                                                                                                 |
-| `IS_STOP`    | Token is a stop word. ~~bool~~                                                                                                                                |
-| `IS_TITLE`   | Token text is in titlecase. ~~bool~~                                                                                                                          |
-| `IS_UPPER`   | Token text is in uppercase. ~~bool~~                                                                                                                          |
-| `LEMMA`      | The token's lemma. ~~str~~                                                                                                                                    |
-| `LENGTH`     | The length of the token text. ~~int~~                                                                                                                         |
-| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~                                                                                                               |
-| `LIKE_NUM`   | Token text resembles a number. ~~bool~~                                                                                                                       |
-| `LIKE_URL`   | Token text resembles a URL. ~~bool~~                                                                                                                          |
-| `LOWER`      | The lowercase form of the token text. ~~str~~                                                                                                                 |
-| `MORPH`      | The token's morphological analysis. ~~MorphAnalysis~~                                                                                                         |
-| `NORM`       | The normalized form of the token text. ~~str~~                                                                                                                |
-| `ORTH`       | The exact verbatim text of a token. ~~str~~                                                                                                                   |
-| `POS`        | The token's universal part of speech (UPOS). ~~str~~                                                                                                          |
-| `SENT_START` | Token is start of sentence. ~~bool~~                                                                                                                          |
-| `SHAPE`      | The token's shape. ~~str~~                                                                                                                                    |
-| `SPACY`      | Token has a trailing space. ~~bool~~                                                                                                                          |
-| `TAG`        | The token's fine-grained part of speech. ~~str~~                                                                                                              |
+| Attribute    | Description                                                                                                                                                    |
+| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `DEP`        | The token's dependency label. ~~str~~                                                                                                                          |
+| `ENT_ID`     | The token's entity ID (`ent_id`). ~~str~~                                                                                                                      |
+| `ENT_IOB`    | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
+| `ENT_KB_ID`  | The token's entity knowledge base ID. ~~str~~                                                                                                                  |
+| `ENT_TYPE`   | The token's entity label. ~~str~~                                                                                                                              |
+| `IS_ALPHA`   | Token text consists of alphabetic characters. ~~bool~~                                                                                                         |
+| `IS_ASCII`   | Token text consists of ASCII characters. ~~bool~~                                                                                                              |
+| `IS_DIGIT`   | Token text consists of digits. ~~bool~~                                                                                                                        |
+| `IS_LOWER`   | Token text is in lowercase. ~~bool~~                                                                                                                           |
+| `IS_PUNCT`   | Token is punctuation. ~~bool~~                                                                                                                                 |
+| `IS_SPACE`   | Token is whitespace. ~~bool~~                                                                                                                                  |
+| `IS_STOP`    | Token is a stop word. ~~bool~~                                                                                                                                 |
+| `IS_TITLE`   | Token text is in titlecase. ~~bool~~                                                                                                                           |
+| `IS_UPPER`   | Token text is in uppercase. ~~bool~~                                                                                                                           |
+| `LEMMA`      | The token's lemma. ~~str~~                                                                                                                                     |
+| `LENGTH`     | The length of the token text. ~~int~~                                                                                                                          |
+| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~                                                                                                                |
+| `LIKE_NUM`   | Token text resembles a number. ~~bool~~                                                                                                                        |
+| `LIKE_URL`   | Token text resembles a URL. ~~bool~~                                                                                                                           |
+| `LOWER`      | The lowercase form of the token text. ~~str~~                                                                                                                  |
+| `MORPH`      | The token's morphological analysis. ~~MorphAnalysis~~                                                                                                          |
+| `NORM`       | The normalized form of the token text. ~~str~~                                                                                                                 |
+| `ORTH`       | The exact verbatim text of a token. ~~str~~                                                                                                                    |
+| `POS`        | The token's universal part of speech (UPOS). ~~str~~                                                                                                           |
+| `SENT_START` | Token is start of sentence. ~~bool~~                                                                                                                           |
+| `SHAPE`      | The token's shape. ~~str~~                                                                                                                                     |
+| `SPACY`      | Token has a trailing space. ~~bool~~                                                                                                                           |
+| `TAG`        | The token's fine-grained part of speech. ~~str~~                                                                                                               |
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 8f04e18c376..480ce34e599 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -565,7 +565,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
 (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
 ✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
 ✔ No entities consisting of or starting/ending with whitespace
 
 =========================== Part-of-speech Tagging ===========================
@@ -1318,7 +1318,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
-Runs prediction trials for a trained model with varying tresholds to maximize
+Runs prediction trials for a trained model with varying thresholds to maximize
 the specified metric. The search space for the threshold is traversed linearly
 from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
 (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 12b2f6bef1d..f4b83d88bbf 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,20 +53,21 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting                                         | Description                                                                                                                                                                                                                                                                                 |
-| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              |
-| `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           |
-| `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        |
-| `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      |
-| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      |
-| `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
-| `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
-| `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
-| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
-| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
-| `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
+| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
+| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
+| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
+| `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
+| `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
+| `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
+| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                                            |
+| `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
+| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
+| `generate_empty_kb` <Tag variant="new">3.5.1</Tag>  | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
+| `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
+| `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
+| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                     |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@@ -99,21 +100,21 @@ custom knowledge base, you should either call
 [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
 [`initialize`](/api/entitylinker#initialize) call.
 
-| Name                                     | Description                                                                                                                                                                                                                                                                                 |
-| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`                                  | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                                                            |
-| `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~                                                                                                                                                                                                   |
-| `name`                                   | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                                                                                                                                                         |
-| _keyword-only_                           |                                                                                                                                                                                                                                                                                             |
-| `entity_vector_length`                   | Size of encoding vectors in the KB. ~~int~~                                                                                                                                                                                                                                                 |
-| `get_candidates`                         | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                                                                                                                                            |
-| `labels_discard`                         | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                                                                                                                                                                              |
-| `n_sents`                                | The number of neighbouring sentences to take into account. ~~int~~                                                                                                                                                                                                                          |
-| `incl_prior`                             | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                                                                                                                                                                            |
-| `incl_context`                           | Whether or not to include the local context in the model. ~~bool~~                                                                                                                                                                                                                          |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
-| `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
-| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Name                                     | Description                                                                                                                                                                                                                                                                                  |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                  | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                                                             |
+| `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~                                                                                                                                                                                                    |
+| `name`                                   | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                                                                                                                                                          |
+| _keyword-only_                           |                                                                                                                                                                                                                                                                                              |
+| `entity_vector_length`                   | Size of encoding vectors in the KB. ~~int~~                                                                                                                                                                                                                                                  |
+| `get_candidates`                         | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                                                                                                                                             |
+| `labels_discard`                         | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                                                                                                                                                                               |
+| `n_sents`                                | The number of neighbouring sentences to take into account. ~~int~~                                                                                                                                                                                                                           |
+| `incl_prior`                             | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                                                                                                                                                                             |
+| `incl_context`                           | Whether or not to include the local context in the model. ~~bool~~                                                                                                                                                                                                                           |
+| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                     |
+| `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                      |
+| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
 
 ## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
 
@@ -200,6 +201,12 @@ knowledge base. This argument should be a function that takes a `Vocab` instance
 and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
 are synced with the current vocab.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 7976e7725e0..335e87676c7 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,24 +1,13 @@
 ---
 title: EntityRuler
-new: 2.1
+tag: class
+source: spacy/pipeline/entityruler.py
+version: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
 ---
 
-<Infobox title="New in v4" variant="warning">
-
-As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
-implemented as a special case of the `SpanRuler` component.
-
-See the [migration guide](#migrating) below for differences between the v3
-`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
-component.
-
-See the [`SpanRuler`](/api/spanruler) API docs for the full API.
-
-</Infobox>
-
 The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
 token-based rules or exact phrase matches. It can be combined with the
 statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
@@ -69,57 +58,279 @@ how the component should be configured. You can override its settings via the
 | Setting                                              | Description                                                                                                                                                                                   |
 | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                             |
+| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                   |
 | `validate`                                           | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~                                                                             |
 | `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                     |
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-## Migrating from v3 {#migrating}
+```python
+%%GITHUB_SPACY/spacy/pipeline/entityruler.py
+```
 
-### Loading patterns
+## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
 
-Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
-initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
-path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
-JSONL file separately and then added through
-[`SpanRuler.initialize`](/api/spanruler#initialize]) or
-[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
+Initialize the entity ruler. If patterns are supplied here, they need to be a
+list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
+be a token pattern (list) or a phrase pattern (string). For example:
+`{"label": "ORG", "pattern": "Apple"}`.
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.from_disk("patterns.jsonl")
-+ import srsly
-+ patterns = srsly.read_jsonl("patterns.jsonl")
-+ ruler.add_patterns(patterns)
-```
+> #### Example
+>
+> ```python
+> # Construction via add_pipe
+> ruler = nlp.add_pipe("entity_ruler")
+>
+> # Construction from class
+> from spacy.pipeline import EntityRuler
+> ruler = EntityRuler(nlp, overwrite_ents=True)
+> ```
 
-### Saving patterns
+| Name                                                 | Description                                                                                                                                                                                                                           |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
+| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
+| _keyword-only_                                       |                                                                                                                                                                                                                                       |
+| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
+| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                           |
+| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
+| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
+| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
+| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
+| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
 
-`SpanRuler.to_disk` always saves the full component data to a directory and does
-not include an option to save the patterns to a single JSONL file.
+## EntityRuler.initialize {id="initialize",tag="method",version="3"}
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.to_disk("patterns.jsonl")
-+ import srsly
-+ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
-```
+Initialize the component with data and used before training to load in rules
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
+method is typically called by [`Language.initialize`](/api/language#initialize)
+and lets you customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
-### Accessing token and phrase patterns
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl
+> ```
 
-The separate token patterns and phrase patterns are no longer accessible under
-`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
-patterns in their original format using the property
-[`SpanRuler.patterns`](/api/spanruler#patterns).
+| Name           | Description                                                                                                                                                          |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |                                                                                                                                                                      |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
 
-### Removing patterns by ID
+## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
 
-[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
-remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
+The number of all patterns added to the entity ruler.
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.remove("id")
-+ ruler.remove_by_id("id")
-```
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> assert len(ruler) == 0
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+> assert len(ruler) == 1
+> ```
+
+| Name        | Description                     |
+| ----------- | ------------------------------- |
+| **RETURNS** | The number of patterns. ~~int~~ |
+
+## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
+
+Whether a label is present in the patterns.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+> assert "ORG" in ruler
+> assert not "PERSON" in ruler
+> ```
+
+| Name        | Description                                           |
+| ----------- | ----------------------------------------------------- |
+| `label`     | The label to check. ~~str~~                           |
+| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
+
+## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
+
+Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
+happens automatically after the component has been added to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
+with `overwrite_ents=True`, existing entities will be replaced if they overlap
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes
+longer patterns over shorter, and if equal the match occurring first in the Doc
+is chosen.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+>
+> doc = nlp("A text about Apple.")
+> ents = [(ent.text, ent.label_) for ent in doc.ents]
+> assert ents == [("Apple", "ORG")]
+> ```
+
+| Name        | Description                                                          |
+| ----------- | -------------------------------------------------------------------- |
+| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
+| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+
+## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+
+Add patterns to the entity ruler. A pattern can either be a token pattern (list
+of dicts) or a phrase pattern (string). For more details, see the usage guide on
+[rule-based matching](/usage/rule-based-matching).
+
+> #### Example
+>
+> ```python
+> patterns = [
+>     {"label": "ORG", "pattern": "Apple"},
+>     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
+> ]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ```
+
+| Name       | Description                                                      |
+| ---------- | ---------------------------------------------------------------- |
+| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
+
+## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
+
+Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
+the ID does not exist.
+
+> #### Example
+>
+> ```python
+> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ruler.remove("apple")
+> ```
+
+| Name | Description                         |
+| ---- | ----------------------------------- |
+| `id` | The ID of the pattern rule. ~~str~~ |
+
+## EntityRuler.to_disk {id="to_disk",tag="method"}
+
+Save the entity ruler patterns to a directory. The patterns will be saved as
+newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
+only the patterns are saved as JSONL. If a directory name is provided, a
+`patterns.jsonl` and `cfg` file with the component configuration is exported.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
+> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
+> ```
+
+| Name   | Description                                                                                                                                              |
+| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+
+## EntityRuler.from_disk {id="from_disk",tag="method"}
+
+Load the entity ruler from a path. Expects either a file containing
+newline-delimited JSON (JSONL) with one entry per line, or a directory
+containing a `patterns.jsonl` file and a `cfg` file with the component
+configuration.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
+> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
+> ```
+
+| Name        | Description                                                                                                   |
+| ----------- | ------------------------------------------------------------------------------------------------------------- |
+| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
+
+## EntityRuler.to_bytes {id="to_bytes",tag="method"}
+
+Serialize the entity ruler patterns to a bytestring.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler_bytes = ruler.to_bytes()
+> ```
+
+| Name        | Description                        |
+| ----------- | ---------------------------------- |
+| **RETURNS** | The serialized patterns. ~~bytes~~ |
+
+## EntityRuler.from_bytes {id="from_bytes",tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> ruler_bytes = ruler.to_bytes()
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.from_bytes(ruler_bytes)
+> ```
+
+| Name         | Description                                        |
+| ------------ | -------------------------------------------------- |
+| `bytes_data` | The bytestring to load. ~~bytes~~                  |
+| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
+
+## EntityRuler.labels {id="labels",tag="property"}
+
+All labels present in the match patterns.
+
+| Name        | Description                            |
+| ----------- | -------------------------------------- |
+| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+
+## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+
+All entity IDs present in the `id` properties of the match patterns.
+
+| Name        | Description                         |
+| ----------- | ----------------------------------- |
+| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+
+## EntityRuler.patterns {id="patterns",tag="property"}
+
+Get all patterns that were added to the entity ruler.
+
+| Name        | Description                                                                              |
+| ----------- | ---------------------------------------------------------------------------------------- |
+| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+
+## Attributes {id="attributes"}
+
+| Name              | Description                                                                                                           |
+| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
+| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
+| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
+| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index cd70d8dcead..5d1b56daebb 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -287,7 +287,7 @@ does not permit other NPs to be nested within it – so no NP-level coordination
 no prepositional phrases, and no relative clauses.
 
 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
-has not been implemeted for the given language, a `NotImplementedError` is
+has not been implemented for the given language, a `NotImplementedError` is
 raised.
 
 > #### Example
diff --git a/website/docs/api/transformer.mdx b/website/docs/api/transformer.mdx
index 8f024553dac..9dcafb55782 100644
--- a/website/docs/api/transformer.mdx
+++ b/website/docs/api/transformer.mdx
@@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
 | `align`        | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                 |
 | `width`        | The width of the last hidden layer. ~~int~~                                                                                                                                                                                                                                                                                          |
 
-### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
+### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}
 
 Create an empty `TransformerData` container.
 
diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx
index 03b85f5af91..344c66e8db2 100644
--- a/website/docs/usage/layers-architectures.mdx
+++ b/website/docs/usage/layers-architectures.mdx
@@ -832,7 +832,7 @@ retrieve and add to them.
 
 After creation, the component needs to be
 [initialized](/usage/training#initialization). This method can define the
-relevant labels in two ways: explicitely by setting the `labels` argument in the
+relevant labels in two ways: explicitly by setting the `labels` argument in the
 [`initialize` block](/api/data-formats#config-initialize) of the config, or
 implicately by deducing them from the `get_examples` callback that generates the
 full **training data set**, or a representative sample.
diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx
index 26d1ad37962..57b95ee7b4f 100644
--- a/website/docs/usage/linguistic-features.mdx
+++ b/website/docs/usage/linguistic-features.mdx
@@ -1899,7 +1899,7 @@ the two words.
     "Shore": ("coast", 0.732257),
     "Precautionary": ("caution", 0.490973),
     "hopelessness": ("sadness", 0.742366),
-    "Continous": ("continuous", 0.732549),
+    "Continuous": ("continuous", 0.732549),
     "Disemboweled": ("corpse", 0.499432),
     "biostatistician": ("scientist", 0.339724),
     "somewheres": ("somewheres", 0.402736),
diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx
index b089a7ab561..e10ba4c506c 100644
--- a/website/docs/usage/projects.mdx
+++ b/website/docs/usage/projects.mdx
@@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
 dependency check, set `check_requirements: false` in your project's
 `project.yml`.
 
-### 4. Run a workflow {id="run-workfow"}
+### 4. Run a workflow {id="run-workflow"}
 
 > #### project.yml
 >
@@ -286,7 +286,7 @@ pipelines.
 | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `title`                                             | An optional project title used in `--help` message and [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                  |
 | `description`                                       | An optional project description used in [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| `vars`                                              | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                       |
+| `vars`                                              | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                      |
 | `env`                                               | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`.                                                                                                                                                                                                                                                                                          |
 | `directories`                                       | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist.                                                                                                                                                                                                                                                                                                                 |
 | `assets`                                            | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo.                                                                        |
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 97ae3c5e573..3712fbeeb80 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -306,7 +306,9 @@ installed in the same environment – that's it.
 
 ### Loading probability tables into existing models
 
-You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
+You can load a probability table from
+[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
+existing spaCy model like `en_core_web_sm`.
 
 ```python
 # Requirements: pip install spacy-lookups-data
@@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
 nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
 ```
 
-When training a model from scratch you can also specify probability tables in the `config.cfg`.
+When training a model from scratch you can also specify probability tables in
+the `config.cfg`.
 
 ```ini {title="config.cfg (excerpt)"}
 [initialize.lookups]
@@ -346,8 +349,8 @@ them**!
 To stick with the theme of
 [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
 consider the following custom spaCy
-[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
-snake when it's called:
+[pipeline component](/usage/processing-pipelines#custom-components) that prints
+a snake when it's called:
 
 > #### Package directory structure
 >
diff --git a/website/docs/usage/v2-2.mdx b/website/docs/usage/v2-2.mdx
index 84129657dda..cf4f7c5bf57 100644
--- a/website/docs/usage/v2-2.mdx
+++ b/website/docs/usage/v2-2.mdx
@@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
 (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
 ✔ Good amount of examples for all labels
-✔ Examples without occurences available for all labels
+✔ Examples without occurrences available for all labels
 ✔ No entities consisting of or starting/ending with whitespace
 
 =========================== Part-of-speech Tagging ===========================
diff --git a/website/docs/usage/v3-2.mdx b/website/docs/usage/v3-2.mdx
index b4a4ef67242..b3ffd5d6820 100644
--- a/website/docs/usage/v3-2.mdx
+++ b/website/docs/usage/v3-2.mdx
@@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
 `TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
 in the [transformer API docs](/api/architectures#TransformerModel).
 
-`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
+`spacy-transformers` v1.1 also adds support for `transformer_config` settings
 such as `output_attentions`. Additional output is stored under
 `TransformerData.model_output`. More details are in the
 [TransformerModel docs](/api/architectures#TransformerModel). The training speed

From 980f7467cd16bdffdaad9740e01b939ad4f376fb Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 29 Apr 2024 13:28:46 +0200
Subject: [PATCH 422/504] Update Typer pin and GH actions (#13471)

* update gh actions

* pin typer upperbound to 1.0.0
---
 .github/workflows/explosionbot.yml         | 2 +-
 .github/workflows/slowtests.yml            | 2 +-
 .github/workflows/spacy_universe_alert.yml | 2 +-
 .github/workflows/tests.yml                | 4 ++--
 .github/workflows/universe_validation.yml  | 2 +-
 requirements.txt                           | 2 +-
 setup.cfg                                  | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml
index 910cfdc40ff..78a27cfa3ba 100644
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@@ -15,7 +15,7 @@ jobs:
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
index f9fd3e81769..17d8989faa8 100644
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours
diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml
index 33851fbcc18..01731ffe0d7 100644
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@@ -18,7 +18,7 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2a236b6bd3e..af115e817e9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Configure Python version
         uses: actions/setup-python@v4
@@ -74,7 +74,7 @@ jobs:
 
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Configure Python version
         uses: actions/setup-python@v4
diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml
index 4d492500c57..ce7df49dbae 100644
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Configure Python version
         uses: actions/setup-python@v4
diff --git a/requirements.txt b/requirements.txt
index 4b58e75506d..94a9d17c0c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
+typer>=0.3.0,<1.0.0
 weasel>=0.1.0,<0.5.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
diff --git a/setup.cfg b/setup.cfg
index a9459c84da7..2b41ab339c4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -44,7 +44,7 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer>=0.3.0,<0.10.0
+    typer>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0; python_version < "3.9"
     numpy>=1.19.0; python_version >= "3.9"

From fe70798fa70574c578b53d70ffebfe4c1d045666 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 30 Apr 2024 09:17:59 +0200
Subject: [PATCH 423/504] Update LICENSE to include 2024 (#13472)

---
 LICENSE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LICENSE b/LICENSE
index 979f5ade7b4..6cb7810c6ee 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From d8bff0f43a617f4181ebec4efaf4f7c4b4a10727 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 2 May 2024 16:46:41 +0200
Subject: [PATCH 424/504] fix docs for MorphAnalysis.__contains__ (#13433)

---
 website/docs/api/morphology.mdx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx
index 018ce25245e..7f6802034d2 100644
--- a/website/docs/api/morphology.mdx
+++ b/website/docs/api/morphology.mdx
@@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis.
 > assert "Feat1=Val1" in morph
 > ```
 
-| Name        | Description                                   |
-| ----------- | --------------------------------------------- |
-| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
+| Name         | Description                                                           |
+| ------------ | --------------------------------------------------------------------- |
+| `feature`    | A feature/value pair. ~~str~~                                         |
+| **RETURNS**  | Whether the feature/value pair is contained in the analysis. ~~bool~~ |
 
 ### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}
 

From 6dc9dc0a7e80c62717a0b2cf3f5180cac604184c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH 425/504] Store activations in `Doc`s when `save_activations`
 is enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/edit_tree_lemmatizer.py        |  47 +----
 spacy/pipeline/entity_linker.py               | 163 ++++++------------
 spacy/pipeline/morphologizer.pyx              |  42 ++---
 spacy/pipeline/senter.pyx                     |  41 ++---
 spacy/pipeline/spancat.py                     |  44 +----
 spacy/pipeline/tagger.pyx                     |  75 ++------
 spacy/pipeline/textcat.py                     |   4 +
 spacy/pipeline/textcat_multilabel.py          |  20 +--
 spacy/pipeline/trainable_pipe.pyx             |  76 +-------
 .../pipeline/test_edit_tree_lemmatizer.py     |  73 +-------
 spacy/tests/pipeline/test_entity_linker.py    |  61 +++----
 spacy/tests/pipeline/test_morphologizer.py    |  31 +---
 spacy/tests/pipeline/test_senter.py           |  31 ----
 spacy/tests/pipeline/test_tagger.py           |  53 +-----
 spacy/tests/pipeline/test_textcat.py          |  65 +------
 spacy/tokens/doc.pxd                          |   2 +
 spacy/tokens/doc.pyi                          |  20 ++-
 website/docs/api/doc.mdx                      |  26 +--
 website/docs/api/entitylinker.mdx             |  29 ++--
 website/docs/api/morphologizer.mdx            |  56 +-----
 20 files changed, 204 insertions(+), 755 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 54c880a7d89..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,10 +1,10 @@
 from collections import Counter
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
 
 import numpy as np
 import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 from thinc.types import ArrayXd, Floats2d, Ints1d
 
 from .. import util
@@ -18,6 +18,10 @@
 from .lemmatizer import lemmatizer_score
 from .trainable_pipe import TrainablePipe
 
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
 
@@ -50,7 +54,6 @@
         "top_k": 1,
         "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
         "save_activations": False,
-        "save_activations": False,
     },
     default_score_weights={"lemma_acc": 1.0},
 )
@@ -64,7 +67,6 @@ def make_edit_tree_lemmatizer(
     top_k: int,
     scorer: Optional[Callable],
     save_activations: bool,
-    save_activations: bool,
 ):
     """Construct an EditTreeLemmatizer component."""
     return EditTreeLemmatizer(
@@ -77,7 +79,6 @@ def make_edit_tree_lemmatizer(
         top_k=top_k,
         scorer=scorer,
         save_activations=save_activations,
-        save_activations=save_activations,
     )
 
 
@@ -98,7 +99,6 @@ def __init__(
         top_k: int = 1,
         scorer: Optional[Callable] = lemmatizer_score,
         save_activations: bool = False,
-        save_activations: bool = False,
     ):
         """
         Construct an edit tree lemmatizer.
@@ -111,7 +111,6 @@ def __init__(
         overwrite (bool): overwrite existing lemma annotations.
         top_k (int): try to apply at most the k most probable edit trees.
         save_activations (bool): save model activations in Doc when annotating.
-        save_activations (bool): save model activations in Doc when annotating.
         """
         self.vocab = vocab
         self.model = model
@@ -127,7 +126,6 @@ def __init__(
         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]
@@ -156,25 +154,6 @@ def get_loss(
 
         return float(loss), d_scores
 
-    def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
-    ) -> Tuple[float, List[Floats2d]]:
-        """Calculate the loss and its gradient for a batch of student
-        scores, relative to teacher scores.
-
-        teacher_scores: Scores representing the teacher model's predictions.
-        student_scores: Scores representing the student model's predictions.
-
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
-        """
-        loss_func = SequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
-        return float(loss), d_scores
-
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
@@ -186,21 +165,13 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
             scores: List[Floats2d] = [
                 self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
             ]
-            guesses: List[Ints1d] = [
-                self.model.ops.alloc((0,), dtype="i") for doc in docs
-            ]
-            scores: List[Floats2d] = [
-                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
-            ]
             assert len(guesses) == n_docs
             return {"probabilities": scores, "tree_ids": guesses}
-            return {"probabilities": scores, "tree_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
         guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
         return {"probabilities": scores, "tree_ids": guesses}
-        return {"probabilities": scores, "tree_ids": guesses}
 
     def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
@@ -260,15 +231,9 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
 
         return guesses
 
-    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
-        batch_tree_ids = activations["tree_ids"]
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         batch_tree_ids = activations["tree_ids"]
         for i, doc in enumerate(docs):
-            if self.save_activations:
-                doc.activations[self.name] = {}
-                for act_name, acts in activations.items():
-                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index eb87d1db987..0f15ef38d45 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -6,37 +6,24 @@
 from itertools import islice
 import srsly
 import random
-import warnings
 from itertools import islice
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import srsly
-from numpy import dtype
 from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from thinc.types import Floats2d
 
-from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
-from ..tokens import Doc, Span
-from ..ml import empty_kb
-from ..tokens import Doc, Span, SpanGroup
-from .pipe import deserialize_config
-from .trainable_pipe import TrainablePipe
-from ..language import Language
-from ..vocab import Vocab
-from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
-from ..util import SimpleFrozenList, registry
 from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
 from ..scorer import Scorer
-from ..tokens import Doc, Span, SpanGroup
+from ..tokens import Doc, Span
 from ..training import Example, validate_examples, validate_get_examples
 from ..util import SimpleFrozenList, registry
 from ..vocab import Vocab
+from .legacy.entity_linker import EntityLinker_v1
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
@@ -45,6 +32,9 @@
 
 KNOWLEDGE_BASE_IDS = "kb_ids"
 
+# See #9050
+BACKWARD_OVERWRITE = True
+
 default_model_config = """
 [model]
 @architectures = "spacy.EntityLinker.v2"
@@ -75,13 +65,13 @@
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
-        "overwrite": False,
+        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
+        "overwrite": True,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "candidates_batch_size": 1,
         "threshold": None,
         "save_activations": False,
-        "save_activations": False,
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -101,7 +91,7 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
     ],
     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
@@ -110,7 +100,6 @@ def make_entity_linker(
     candidates_batch_size: int,
     threshold: Optional[float] = None,
     save_activations: bool,
-    save_activations: bool,
 ):
     """Construct an EntityLinker component.
 
@@ -125,7 +114,7 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
         produces a list of candidates, given a certain knowledge base and a textual mention.
     get_candidates_batch (
-        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
@@ -135,11 +124,23 @@ def make_entity_linker(
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
-    save_activations (bool): save model activations in Doc when annotating.
     """
-    if not model.attrs.get("include_span_maker", False):
-        raise ValueError(Errors.E4005)
 
+    if not model.attrs.get("include_span_maker", False):
+        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
+        return EntityLinker_v1(
+            nlp.vocab,
+            model,
+            name,
+            labels_discard=labels_discard,
+            n_sents=n_sents,
+            incl_prior=incl_prior,
+            incl_context=incl_context,
+            entity_vector_length=entity_vector_length,
+            get_candidates=get_candidates,
+            overwrite=overwrite,
+            scorer=scorer,
+        )
     return EntityLinker(
         nlp.vocab,
         model,
@@ -158,7 +159,6 @@ def make_entity_linker(
         candidates_batch_size=candidates_batch_size,
         threshold=threshold,
         save_activations=save_activations,
-        save_activations=save_activations,
     )
 
 
@@ -192,15 +192,15 @@ def __init__(
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
         ],
-        overwrite: bool = False,
+        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
+        overwrite: bool = BACKWARD_OVERWRITE,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
         candidates_batch_size: int,
         threshold: Optional[float] = None,
         save_activations: bool = False,
-        save_activations: bool = False,
     ) -> None:
         """Initialize an entity linker.
 
@@ -216,10 +216,10 @@ def __init__(
         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
         get_candidates_batch (
-            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
-        overwrite (bool): Whether to overwrite existing non-empty annotations.
+        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
             component must provide entity annotations.
@@ -255,12 +255,9 @@ def __init__(
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
-        if self.incl_prior and not self.kb.supports_prior_probs:
-            warnings.warn(Warnings.W401)
 
         def _score_with_ents_set(examples: Iterable[Example], **kwargs):
             # Because of how spaCy works, we can't just score immediately, because Language.evaluate
@@ -463,7 +460,6 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         loss = loss / len(entity_encodings)
         return float(loss), out
 
-    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
         Returns the KB IDs for each entity in each doc, including NIL if there is
@@ -481,47 +477,39 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         xp = ops.xp
         docs_ents: List[Ragged] = []
         docs_scores: List[Ragged] = []
-        ops = self.model.ops
-        xp = ops.xp
-        docs_ents: List[Ragged] = []
-        docs_scores: List[Ragged] = []
         if not docs:
-            return {
-                KNOWLEDGE_BASE_IDS: final_kb_ids,
-                "ents": docs_ents,
-                "scores": docs_scores,
-            }
+            return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
         if isinstance(docs, Doc):
             docs = [docs]
-        for doc in docs:
-            doc_ents: List[Ints1d] = []
-            doc_scores: List[Floats1d] = []
         for doc in docs:
             doc_ents: List[Ints1d] = []
             doc_scores: List[Floats1d] = []
             if len(doc) == 0:
-                docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
-                docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
                 docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                 continue
             sentences = [s for s in doc.sents]
 
-            # Loop over entities in batches.
-            for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
-                ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
-                # Look up candidate entities.
-                valid_ent_idx = [
-                    idx
-                    for idx in range(len(ent_batch))
-                    if ent_batch[idx].label_ not in self.labels_discard
-                ]
-
-                batch_candidates = list(
-                    self.get_candidates_batch(
-                        self.kb,
-                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
+                if self.incl_context:
+                    # get n_neighbour sentences, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    sentence_encoding = self.model.predict([sent_doc])[0]
+                    sentence_encoding_t = sentence_encoding.T
+                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                entity_count += 1
+                if ent.label_ in self.labels_discard:
+                    # ignoring this entity - setting to NIL
+                    final_kb_ids.append(self.NIL)
+                    self._add_activations(
+                        doc_scores=doc_scores,
+                        doc_ents=doc_ents,
+                        scores=[0.0],
+                        ents=[0],
                     )
                 else:
                     candidates = list(self.get_candidates(self.kb, ent))
@@ -592,39 +580,23 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
                 method="predict", msg="result variables not of equal length"
             )
             raise RuntimeError(err)
-        return {
-            KNOWLEDGE_BASE_IDS: final_kb_ids,
-            "ents": docs_ents,
-            "scores": docs_scores,
-        }
+        return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
 
-    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced
                                  by EntityLinker.predict.
-        activations (ActivationsT): The activations used for setting annotations, produced
-                                 by EntityLinker.predict.
 
         DOCS: https://spacy.io/api/entitylinker#set_annotations
         """
         kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
-        kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
         count_ents = len([ent for doc in docs for ent in doc.ents])
         if count_ents != len(kb_ids):
             raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
         i = 0
         overwrite = self.cfg["overwrite"]
-        for j, doc in enumerate(docs):
-            if self.save_activations:
-                doc.activations[self.name] = {}
-                for act_name, acts in activations.items():
-                    if act_name != KNOWLEDGE_BASE_IDS:
-                        # We only copy activations that are Ragged.
-                        doc.activations[self.name][act_name] = cast(Ragged, acts[j])
-
         for j, doc in enumerate(docs):
             if self.save_activations:
                 doc.activations[self.name] = {}
@@ -760,32 +732,3 @@ def _add_activations(
         ops = self.model.ops
         doc_scores.append(ops.asarray1f(scores))
         doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
-
-    def _add_doc_activations(
-        self,
-        *,
-        docs_scores: List[Ragged],
-        docs_ents: List[Ragged],
-        doc_scores: List[Floats1d],
-        doc_ents: List[Ints1d],
-    ):
-        if not self.save_activations:
-            return
-        ops = self.model.ops
-        lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
-        docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
-        docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
-
-    def _add_activations(
-        self,
-        *,
-        doc_scores: List[Floats1d],
-        doc_ents: List[Ints1d],
-        scores: Sequence[float],
-        ents: Sequence[int],
-    ):
-        if not self.save_activations:
-            return
-        ops = self.model.ops
-        doc_scores.append(ops.asarray1f(scores))
-        doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 443b6818dc2..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,11 +1,10 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
 import srsly
-from thinc.api import Model, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from thinc.types import Floats2d, Ints1d
 from itertools import islice
-from typing import Callable, Dict, Iterable, Optional, Union
+from typing import Callable, Dict, Optional, Union
 
 from thinc.api import Config, Model, SequenceCategoricalCrossentropy
 
@@ -24,10 +23,13 @@ from ..errors import Errors
 from ..language import Language
 from ..parts_of_speech import IDS as POS_IDS
 from ..scorer import Scorer
-from ..symbols import POS
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import ActivationsT, Tagger
+from .tagger import Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
 
 default_model_config = """
 [model]
@@ -65,13 +67,6 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
         "save_activations": False,
     },
-    default_config={
-        "model": DEFAULT_MORPH_MODEL,
-        "overwrite": True,
-        "extend": False,
-        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
-        "save_activations": False,
-    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -83,12 +78,9 @@ def make_morphologizer(
     label_smoothing: float,
     scorer: Optional[Callable],
     save_activations: bool,
-    save_activations: bool,
 ):
     return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
                          save_activations=save_activations)
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
-                         save_activations=save_activations)
 
 
 def morphologizer_score(examples, **kwargs):
@@ -120,11 +112,11 @@ class Morphologizer(Tagger):
         model: Model,
         name: str = "morphologizer",
         *,
-        overwrite: bool = False,
-        extend: bool = False,
+        overwrite: bool = BACKWARD_OVERWRITE,
+        extend: bool = BACKWARD_EXTEND,
+        label_smoothing: float = 0.0,
         scorer: Optional[Callable] = morphologizer_score,
         save_activations: bool = False,
-        save_activations: bool = False,
     ):
         """Initialize a morphologizer.
 
@@ -132,13 +124,10 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        overwrite (bool): Whether to overwrite existing annotations.
-        extend (bool): Whether to extend existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attributes "pos" and "morph" and
             Scorer.score_token_attr_per_feat for the attribute "morph".
         save_activations (bool): save model activations in Doc when annotating.
-        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/morphologizer#init
         """
@@ -160,7 +149,6 @@ class Morphologizer(Tagger):
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -254,18 +242,15 @@ class Morphologizer(Tagger):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
 
-    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
-        activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
 
         DOCS: https://spacy.io/api/morphologizer#set_annotations
         """
         batch_tag_ids = activations["label_ids"]
-        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
@@ -276,10 +261,6 @@ class Morphologizer(Tagger):
         # to allocate a compatible container out of the iterable.
         labels = tuple(self.labels)
         for i, doc in enumerate(docs):
-            if self.save_activations:
-                doc.activations[self.name] = {}
-                for act_name, acts in activations.items():
-                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
@@ -321,8 +302,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
-                                                    label_smoothing=self.cfg["label_smoothing"])
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 35627bbf2ad..521afe1d181 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,18 +1,25 @@
 # cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Optional
 
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
 
 from ..tokens.doc cimport Doc
 
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .tagger import ActivationsT, Tagger
+from .tagger import Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = False
 
 default_model_config = """
 [model]
@@ -40,12 +47,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
         "scorer": {"@scorers": "spacy.senter_scorer.v1"},
         "save_activations": False,
     },
-    default_config={
-        "model": DEFAULT_SENTER_MODEL,
-        "overwrite": False,
-        "scorer": {"@scorers": "spacy.senter_scorer.v1"},
-        "save_activations": False,
-    },
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_senter(nlp: Language,
@@ -55,13 +56,6 @@ def make_senter(nlp: Language,
                 scorer: Optional[Callable],
                 save_activations: bool):
     return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
-def make_senter(nlp: Language,
-                name: str,
-                model: Model,
-                overwrite: bool,
-                scorer: Optional[Callable],
-                save_activations: bool):
-    return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
 
 
 def senter_score(examples, **kwargs):
@@ -89,10 +83,9 @@ class SentenceRecognizer(Tagger):
         model,
         name="senter",
         *,
-        overwrite=False,
+        overwrite=BACKWARD_OVERWRITE,
         scorer=senter_score,
         save_activations: bool = False,
-        save_activations: bool = False,
     ):
         """Initialize a sentence recognizer.
 
@@ -100,11 +93,9 @@ class SentenceRecognizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the attribute "sents".
         save_activations (bool): save model activations in Doc when annotating.
-        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/sentencerecognizer#init
         """
@@ -115,7 +106,6 @@ class SentenceRecognizer(Tagger):
         self.cfg = {"overwrite": overwrite}
         self.scorer = scorer
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -133,27 +123,20 @@ class SentenceRecognizer(Tagger):
     def label_data(self):
         return None
 
-    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
-        activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
 
         DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
         """
         batch_tag_ids = activations["label_ids"]
-        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         for i, doc in enumerate(docs):
-            if self.save_activations:
-                doc.activations[self.name] = {}
-                for act_name, acts in activations.items():
-                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 9d9415692a8..1450bb5d6cb 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,29 +1,14 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Protocol,
-    Tuple,
-    Union,
-    cast,
-    runtime_checkable,
-)
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
 
 import numpy
 from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
 from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
 
-from ..scorer import Scorer
-from ..language import Language
-from .trainable_pipe import TrainablePipe
-from ..tokens import Doc, SpanGroup, Span
-from ..vocab import Vocab
-from ..training import Example, validate_examples
+from ..compat import Protocol, runtime_checkable
 from ..errors import Errors
 from ..language import Language
 from ..scorer import Scorer
@@ -36,9 +21,6 @@
 ActivationsT = Dict[str, Union[Floats2d, Ragged]]
 
 
-ActivationsT = Dict[str, Union[Floats2d, Ragged]]
-
-
 spancat_default_config = """
 [model]
 @architectures = "spacy.SpanCategorizer.v1"
@@ -194,7 +176,6 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
         "save_activations": False,
-        "save_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -208,7 +189,6 @@ def make_spancat(
     threshold: float,
     max_positive: Optional[int],
     save_activations: bool,
-    save_activations: bool,
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component and configure it for multi-label
     classification to be able to assign multiple labels for each span.
@@ -237,7 +217,6 @@ def make_spancat(
     max_positive (Optional[int]): Maximum number of labels to consider positive
         per span. Defaults to None, indicating no limit.
         save_activations (bool): save model activations in Doc when annotating.
-        save_activations (bool): save model activations in Doc when annotating.
     """
     return SpanCategorizer(
         nlp.vocab,
@@ -317,7 +296,6 @@ def make_spancat_singlelabel(
         threshold=None,
         scorer=scorer,
         save_activations=save_activations,
-        save_activations=save_activations,
     )
 
 
@@ -381,7 +359,6 @@ def __init__(
         threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
         save_activations: bool = False,
-        save_activations: bool = False,
     ) -> None:
         """Initialize the multi-label or multi-class span categorizer.
 
@@ -432,7 +409,6 @@ def __init__(
         self.name = name
         self.scorer = scorer
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
     @property
     def key(self) -> str:
@@ -490,7 +466,6 @@ def label_data(self) -> List[str]:
         """
         return list(self.labels)
 
-    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
@@ -502,8 +477,6 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         indices = self.suggester(docs, ops=self.model.ops)
         scores = self.model.predict((docs, indices))  # type: ignore
         return {"indices": indices, "scores": scores}
-        scores = self.model.predict((docs, indices))  # type: ignore
-        return {"indices": indices, "scores": scores}
 
     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -523,13 +496,11 @@ def set_candidates(
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])
 
-    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
-        activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
 
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
@@ -538,9 +509,10 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
         indices = activations["indices"]
         assert isinstance(indices, Ragged)
         scores = cast(Floats2d, activations["scores"])
+
         offset = 0
         for i, doc in enumerate(docs):
-            indices_i = cast(Ints2d, indices[i].dataXd)
+            indices_i = indices[i].dataXd
             if self.save_activations:
                 doc.activations[self.name] = {}
                 doc.activations[self.name]["indices"] = indices_i
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index ccd401b6af9..8ecd0c46ee0 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,37 +1,32 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
-from typing import Tuple
 import numpy
 import srsly
-from thinc.api import Model, set_dropout_rate, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
 from thinc.types import Floats2d, Ints1d
 import warnings
 from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Callable, Optional
 
 import numpy
 from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
-from thinc.types import Floats2d, Ints1d
 
-from ..morphology cimport Morphology
 from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
 
 from .. import util
-from ..attrs import ID, POS
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..language import Language
-from ..parts_of_speech import X
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
-from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 
 
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 
+# See #9050
+BACKWARD_OVERWRITE = False
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -59,13 +54,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
         "neg_prefix": "!",
         "save_activations": False,
     },
-    default_config={
-        "model": DEFAULT_TAGGER_MODEL,
-        "overwrite": False,
-        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
-        "neg_prefix": "!",
-        "save_activations": False,
-    },
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(
@@ -76,7 +64,6 @@ def make_tagger(
     scorer: Optional[Callable],
     neg_prefix: str,
     save_activations: bool,
-    save_activations: bool,
 ):
     """Construct a part-of-speech tagger component.
 
@@ -87,8 +74,6 @@ def make_tagger(
     """
     return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
                   save_activations=save_activations)
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
-                  save_activations=save_activations)
 
 
 def tagger_score(examples, **kwargs):
@@ -111,11 +96,10 @@ class Tagger(TrainablePipe):
         model,
         name="tagger",
         *,
-        overwrite=False,
+        overwrite=BACKWARD_OVERWRITE,
         scorer=tagger_score,
         neg_prefix="!",
         save_activations: bool = False,
-        save_activations: bool = False,
     ):
         """Initialize a part-of-speech tagger.
 
@@ -123,11 +107,9 @@ class Tagger(TrainablePipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        overwrite (bool): Whether to overwrite existing annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_token_attr for the attribute "tag".
         save_activations (bool): save model activations in Doc when annotating.
-        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/tagger#init
         """
@@ -139,7 +121,6 @@ class Tagger(TrainablePipe):
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
     @property
     def labels(self):
@@ -158,7 +139,6 @@ class Tagger(TrainablePipe):
         """Data about the labels currently added to the component."""
         return tuple(self.cfg["labels"])
 
-    def predict(self, docs) -> ActivationsT:
     def predict(self, docs) -> ActivationsT:
         """Apply the pipeline's model to a batch of docs, without modifying them.
 
@@ -173,13 +153,11 @@ class Tagger(TrainablePipe):
             guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
             assert len(guesses) == len(docs)
             return {"probabilities": guesses, "label_ids": guesses}
-            return {"probabilities": guesses, "label_ids": guesses}
         scores = self.model.predict(docs)
         assert len(scores) == len(docs), (len(scores), len(docs))
         guesses = self._scores2guesses(scores)
         assert len(guesses) == len(docs)
         return {"probabilities": scores, "label_ids": guesses}
-        return {"probabilities": scores, "label_ids": guesses}
 
     def _scores2guesses(self, scores):
         guesses = []
@@ -190,28 +168,21 @@ class Tagger(TrainablePipe):
             guesses.append(doc_guesses)
         return guesses
 
-    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
     def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
         """Modify a batch of documents, using pre-computed scores.
 
         docs (Iterable[Doc]): The documents to modify.
         activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
-        activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
 
         DOCS: https://spacy.io/api/tagger#set_annotations
         """
         batch_tag_ids = activations["label_ids"]
-        batch_tag_ids = activations["label_ids"]
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
         cdef bint overwrite = self.cfg["overwrite"]
         labels = self.labels
         for i, doc in enumerate(docs):
-            if self.save_activations:
-                doc.activations[self.name] = {}
-                for act_name, acts in activations.items():
-                    doc.activations[self.name][act_name] = acts[i]
             if self.save_activations:
                 doc.activations[self.name] = {}
                 for act_name, acts in activations.items():
@@ -271,6 +242,7 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
+        loss_func = SequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -284,32 +256,12 @@ class Tagger(TrainablePipe):
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update(docs)
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
-        loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
+        grads, loss = loss_func(tag_scores, tutor_tag_scores)
         bp_tag_scores(grads)
-        if sgd is not None:
-            self.finish_update(sgd)
+        self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
-    def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
-    ) -> Tuple[float, List[Floats2d]]:
-        """Calculate the loss and its gradient for a batch of student
-        scores, relative to teacher scores.
-
-        teacher_scores: Scores representing the teacher model's predictions.
-        student_scores: Scores representing the student model's predictions.
-
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
-        """
-        loss_func = SequenceCategoricalCrossentropy(normalize=False)
-        d_scores, loss = loss_func(student_scores, teacher_scores)
-        if self.model.ops.xp.isnan(loss):
-            raise ValueError(Errors.E910.format(name=self.name))
-        return float(loss), d_scores
-
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores.
@@ -321,12 +273,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(
-            names=self.labels,
-            normalize=False,
-            neg_prefix=self.cfg["neg_prefix"],
-            label_smoothing=self.cfg["label_smoothing"]
-        )
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 13841dd7bbb..79a98b9bc5f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index e1c1fdc7a34..ac024ba3639 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,5 +1,9 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional
 
 from thinc.api import Config, Model
 from thinc.types import Floats2d
@@ -80,8 +84,6 @@
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
         "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
         "save_activations": False,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
-        "save_activations": False,
     },
     default_score_weights={
         "cats_score": 1.0,
@@ -103,9 +105,6 @@ def make_multilabel_textcat(
     threshold: float,
     scorer: Optional[Callable],
     save_activations: bool,
-) -> "TextCategorizer":
-    """Create a TextCategorizer component. The text categorizer predicts categories
-    save_activations: bool,
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -124,12 +123,6 @@ def make_multilabel_textcat(
         threshold=threshold,
         scorer=scorer,
         save_activations=save_activations,
-        nlp.vocab,
-        model,
-        name,
-        threshold=threshold,
-        scorer=scorer,
-        save_activations=save_activations,
     )
 
 
@@ -162,7 +155,6 @@ def __init__(
         threshold: float,
         scorer: Optional[Callable] = textcat_multilabel_score,
         save_activations: bool = False,
-        save_activations: bool = False,
     ) -> None:
         """Initialize a text categorizer for multi-label classification.
 
@@ -171,7 +163,6 @@ def __init__(
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
-        scorer (Optional[Callable]): The scoring method.
         save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/textcategorizer#init
@@ -184,7 +175,6 @@ def __init__(
         self.cfg = dict(cfg)
         self.scorer = scorer
         self.save_activations = save_activations
-        self.save_activations = save_activations
 
     @property
     def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 546a1c48abb..bd360c9501b 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
 
 import srsly
@@ -7,13 +7,13 @@ import warnings
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples, validate_distillation_examples
+from ..training import validate_examples
 from ..errors import Errors, Warnings
 from .pipe import Pipe, deserialize_config
 from .. import util
 from ..errors import Errors
 from ..language import Language
-from ..training import Example, validate_distillation_examples, validate_examples
+from ..training import Example, validate_examples
 from ..vocab import Vocab
 from .pipe import Pipe, deserialize_config
 
@@ -59,54 +59,7 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-
-    def distill(self,
-               teacher_pipe: Optional["TrainablePipe"],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
-        """Train a pipe (the student) on the predictions of another pipe
-        (the teacher). The student is typically trained on the probability
-        distribution of the teacher, but details may differ per pipe.
-
-        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
-            from.
-        examples (Iterable[Example]): Distillation examples. The reference
-            (teacher) and predicted (student) docs must have the same number of
-            tokens and the same orthography.
-        drop (float): dropout rate.
-        sgd (Optional[Optimizer]): An optimizer. Will be created via
-            create_optimizer if not set.
-        losses (Optional[Dict[str, float]]): Optional record of loss during
-            distillation.
-        RETURNS: The updated losses dictionary.
-
-        DOCS: https://spacy.io/api/pipe#distill
-        """
-        # By default we require a teacher pipe, but there are downstream
-        # implementations that don't require a pipe.
-        if teacher_pipe is None:
-            raise ValueError(Errors.E4002.format(name=self.name))
-        if losses is None:
-            losses = {}
-        losses.setdefault(self.name, 0.0)
-        validate_distillation_examples(examples, "TrainablePipe.distill")
-        set_dropout_rate(self.model, drop)
-        for node in teacher_pipe.model.walk():
-            if node.name == "softmax":
-                node.attrs["softmax_normalize"] = True
-        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
-        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-        bp_student_scores(d_scores)
-        if sgd is not None:
-            self.finish_update(sgd)
-        losses[self.name] += loss
-        return losses
-
-    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -219,19 +172,6 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
 
-    def get_teacher_student_loss(self, teacher_scores, student_scores):
-        """Calculate the loss and its gradient for a batch of student
-        scores, relative to teacher scores.
-
-        teacher_scores: Scores representing the teacher model's predictions.
-        student_scores: Scores representing the student model's predictions.
-
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
-        """
-        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
-
     def create_optimizer(self) -> Optimizer:
         """Create an optimizer for the pipeline component.
 
@@ -268,14 +208,6 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
-    @property
-    def is_distillable(self) -> bool:
-        # Normally a pipe overrides `get_teacher_student_loss` to implement
-        # distillation. In more exceptional cases, a pipe can provide its
-        # own `distill` implementation. If neither of these methods is
-        # overridden, the pipe does not implement distillation.
-        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
-
     @property
     def is_trainable(self) -> bool:
         return True
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 0f925c0d4e1..ba2ed4e5ff3 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,5 +1,5 @@
-import pickle
 from typing import cast
+import pickle
 
 import hypothesis.strategies as st
 import pytest
@@ -10,6 +10,7 @@
 from spacy.language import Language
 from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
 from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
 from spacy.strings import StringStore
 from spacy.training import Example
 from spacy.util import make_tempdir
@@ -213,53 +214,6 @@ def test_overfitting_IO(top_k):
     assert doc4[3].lemma_ == "egg"
 
 
-def test_is_distillable():
-    nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
-    assert lemmatizer.is_distillable
-
-
-def test_distill():
-    teacher = English()
-    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
-    teacher_lemmatizer.min_tree_freq = 1
-    train_examples = []
-    for t in TRAIN_DATA:
-        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
-
-    optimizer = teacher.initialize(get_examples=lambda: train_examples)
-
-    for i in range(50):
-        losses = {}
-        teacher.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["trainable_lemmatizer"] < 0.00001
-
-    student = English()
-    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
-    student_lemmatizer.min_tree_freq = 1
-    student_lemmatizer.initialize(
-        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
-    )
-
-    distill_examples = [
-        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
-    ]
-
-    for i in range(50):
-        losses = {}
-        student_lemmatizer.distill(
-            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
-        )
-    assert losses["trainable_lemmatizer"] < 0.00001
-
-    test_text = "She likes blue eggs"
-    doc = student(test_text)
-    assert doc[0].lemma_ == "she"
-    assert doc[1].lemma_ == "like"
-    assert doc[2].lemma_ == "blue"
-    assert doc[3].lemma_ == "egg"
-
-
 def test_lemmatizer_requires_labels():
     nlp = English()
     nlp.add_pipe("trainable_lemmatizer")
@@ -403,26 +357,3 @@ def test_save_activations():
     ]
     assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
     assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
-
-
-def test_save_activations():
-    nlp = English()
-    lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
-    lemmatizer.min_tree_freq = 1
-    train_examples = []
-    for t in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.initialize(get_examples=lambda: train_examples)
-    nO = lemmatizer.model.get_dim("nO")
-
-    doc = nlp("This is a test.")
-    assert "trainable_lemmatizer" not in doc.activations
-
-    lemmatizer.save_activations = True
-    doc = nlp("This is a test.")
-    assert list(doc.activations["trainable_lemmatizer"].keys()) == [
-        "probabilities",
-        "tree_ids",
-    ]
-    assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
-    assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index a3ab80f7ee0..32e7a265f37 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,18 +1,17 @@
-from typing import Any, Callable, Dict, Iterable, cast
+from typing import Callable, Iterable, Dict, Any, cast
 
 import pytest
 from numpy.testing import assert_equal
 from thinc.types import Ragged
-from thinc.types import Ragged
 
 from spacy import Language, registry, util
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
-from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
+from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
 from spacy.lang.en import English
 from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker, get_candidates
 from spacy.pipeline import EntityLinker, TrainablePipe
+from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tests.util import make_tempdir
@@ -454,17 +453,16 @@ def test_candidate_generation(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
-    adam_ent_cands = get_candidates(mykb, adam_ent)
     assert len(get_candidates(mykb, douglas_ent)) == 2
-    assert len(adam_ent_cands) == 1
+    assert len(get_candidates(mykb, adam_ent)) == 1
     assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive
     assert len(get_candidates(mykb, shrubbery_ent)) == 0
 
     # test the content of the candidates
-    assert adam_ent_cands[0].entity_id_ == "Q2"
-    assert adam_ent_cands[0].alias == "adam"
-    assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
-    assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
+    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
+    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
+    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
+    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
 
 
 def test_el_pipe_configuration(nlp):
@@ -492,7 +490,7 @@ def create_kb(vocab):
     assert doc[2].ent_kb_id_ == "Q2"
 
     def get_lowercased_candidates(kb, span):
-        return kb._get_alias_candidates(span.text.lower())
+        return kb.get_alias_candidates(span.text.lower())
 
     def get_lowercased_candidates_batch(kb, spans):
         return [get_lowercased_candidates(kb, span) for span in spans]
@@ -551,22 +549,24 @@ def test_vocab_serialization(nlp):
     mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
-    candidates = mykb._get_alias_candidates("adam")
+    candidates = mykb.get_alias_candidates("adam")
     assert len(candidates) == 1
-    assert candidates[0].entity_id == q2_hash
-    assert candidates[0].entity_id_ == "Q2"
-    assert candidates[0].alias == "adam"
+    assert candidates[0].entity == q2_hash
+    assert candidates[0].entity_ == "Q2"
+    assert candidates[0].alias == adam_hash
+    assert candidates[0].alias_ == "adam"
 
     with make_tempdir() as d:
         mykb.to_disk(d / "kb")
         kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         kb_new_vocab.from_disk(d / "kb")
 
-        candidates = kb_new_vocab._get_alias_candidates("adam")
+        candidates = kb_new_vocab.get_alias_candidates("adam")
         assert len(candidates) == 1
-        assert candidates[0].entity_id == q2_hash
-        assert candidates[0].entity_id_ == "Q2"
-        assert candidates[0].alias == "adam"
+        assert candidates[0].entity == q2_hash
+        assert candidates[0].entity_ == "Q2"
+        assert candidates[0].alias == adam_hash
+        assert candidates[0].alias_ == "adam"
 
         assert kb_new_vocab.get_vector("Q2") == [2]
         assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@@ -586,20 +586,20 @@ def test_append_alias(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     # test the size of the relevant candidates
-    assert len(mykb._get_alias_candidates("douglas")) == 2
+    assert len(mykb.get_alias_candidates("douglas")) == 2
 
     # append an alias
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
     # test the size of the relevant candidates has been incremented
-    assert len(mykb._get_alias_candidates("douglas")) == 3
+    assert len(mykb.get_alias_candidates("douglas")) == 3
 
     # append the same alias-entity pair again should not work (will throw a warning)
     with pytest.warns(UserWarning):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
 
     # test the size of the relevant candidates remained unchanged
-    assert len(mykb._get_alias_candidates("douglas")) == 3
+    assert len(mykb.get_alias_candidates("douglas")) == 3
 
 
 @pytest.mark.filterwarnings("ignore:\\[W036")
@@ -999,11 +999,11 @@ def test_kb_to_bytes():
     assert kb_2.contains_alias("Russ Cochran")
     assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
     assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-    assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
-        kb_2._get_alias_candidates("Russ Cochran")
+    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
+        kb_2.get_alias_candidates("Russ Cochran")
     )
-    assert len(kb_1._get_alias_candidates("Randomness")) == len(
-        kb_2._get_alias_candidates("Randomness")
+    assert len(kb_1.get_alias_candidates("Randomness")) == len(
+        kb_2.get_alias_candidates("Randomness")
     )
 
 
@@ -1084,6 +1084,7 @@ def test_scorer_links():
 @pytest.mark.parametrize(
     "name,config",
     [
+        ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
     ],
 )
@@ -1110,7 +1111,10 @@ def create_kb(vocab):
         return mykb
 
     entity_linker = nlp.add_pipe(name, config={"model": config})
-    assert isinstance(entity_linker, EntityLinker)
+    if config["@architectures"] == "spacy.EntityLinker.v1":
+        assert isinstance(entity_linker, EntityLinker_v1)
+    else:
+        assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
 
@@ -1293,7 +1297,6 @@ def create_kb(vocab):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
 
 
-def test_save_activations():
 def test_save_activations():
     nlp = English()
     vector_length = 3
@@ -1309,7 +1312,7 @@ def create_kb(vocab):
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 9a6bbc9fc60..c2b65977ac3 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,5 +1,4 @@
 from typing import cast
-
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import get_current_ops
@@ -10,7 +9,7 @@
 from spacy.language import Language
 from spacy.morphology import Morphology
 from spacy.pipeline import TrainablePipe
-from spacy.tests.util import make_tempdir
+from spacy.attrs import MORPH
 from spacy.tokens import Doc
 from spacy.training import Example
 
@@ -78,12 +77,6 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)
 
 
-def test_is_distillable():
-    nlp = English()
-    morphologizer = nlp.add_pipe("morphologizer")
-    assert morphologizer.is_distillable
-
-
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
@@ -255,25 +248,3 @@ def test_save_activations():
     }
     assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
     assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
-
-
-def test_save_activations():
-    nlp = English()
-    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
-    train_examples = []
-    for inst in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    nlp.initialize(get_examples=lambda: train_examples)
-
-    doc = nlp("This is a test.")
-    assert "morphologizer" not in doc.activations
-
-    morphologizer.save_activations = True
-    doc = nlp("This is a test.")
-    assert "morphologizer" in doc.activations
-    assert set(doc.activations["morphologizer"].keys()) == {
-        "label_ids",
-        "probabilities",
-    }
-    assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
-    assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 9a798eae890..2e40d86ff48 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,5 +1,4 @@
 from typing import cast
-
 import pytest
 from numpy.testing import assert_equal
 
@@ -8,17 +7,10 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir
 from spacy.training import Example
 
 
-def test_is_distillable():
-    nlp = English()
-    senter = nlp.add_pipe("senter")
-    assert senter.is_distillable
-
-
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")
@@ -134,26 +126,3 @@ def test_save_activations():
     assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
     assert doc.activations["senter"]["probabilities"].shape == (5, nO)
     assert doc.activations["senter"]["label_ids"].shape == (5,)
-
-
-def test_save_activations():
-    # Test if activations are correctly added to Doc when requested.
-    nlp = English()
-    senter = cast(TrainablePipe, nlp.add_pipe("senter"))
-
-    train_examples = []
-    for t in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-
-    nlp.initialize(get_examples=lambda: train_examples)
-    nO = senter.model.get_dim("nO")
-
-    doc = nlp("This is a test.")
-    assert "senter" not in doc.activations
-
-    senter.save_activations = True
-    doc = nlp("This is a test.")
-    assert "senter" in doc.activations
-    assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
-    assert doc.activations["senter"]["probabilities"].shape == (5, nO)
-    assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 05e814f0733..5deb323dd71 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,5 +1,4 @@
 from typing import cast
-
 import pytest
 from numpy.testing import assert_almost_equal, assert_equal
 from thinc.api import compounding, get_current_ops
@@ -9,7 +8,7 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
-from spacy.training import Example
+from thinc.api import compounding
 
 from ..util import make_tempdir
 
@@ -25,9 +24,7 @@ def test_issue4348():
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = util.minibatch(
-            TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
-        )
+        batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)
 
@@ -240,52 +237,6 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"
 
 
-def test_is_distillable():
-    nlp = English()
-    tagger = nlp.add_pipe("tagger")
-    assert tagger.is_distillable
-
-
-def test_distill():
-    teacher = English()
-    teacher_tagger = teacher.add_pipe("tagger")
-    train_examples = []
-    for t in TRAIN_DATA:
-        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
-
-    optimizer = teacher.initialize(get_examples=lambda: train_examples)
-
-    for i in range(50):
-        losses = {}
-        teacher.update(train_examples, sgd=optimizer, losses=losses)
-    assert losses["tagger"] < 0.00001
-
-    student = English()
-    student_tagger = student.add_pipe("tagger")
-    student_tagger.min_tree_freq = 1
-    student_tagger.initialize(
-        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
-    )
-
-    distill_examples = [
-        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
-    ]
-
-    for i in range(50):
-        losses = {}
-        student_tagger.distill(
-            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
-        )
-    assert losses["tagger"] < 0.00001
-
-    test_text = "I like blue eggs"
-    doc = student(test_text)
-    assert doc[0].tag_ == "N"
-    assert doc[1].tag_ == "V"
-    assert doc[2].tag_ == "J"
-    assert doc[3].tag_ == "N"
-
-
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index f834597fafe..710dac0571d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,5 +1,5 @@
-import random
 from typing import cast
+import random
 
 import numpy.random
 import pytest
@@ -13,16 +13,12 @@
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import (
-    single_label_bow_config,
-    single_label_cnn_config,
-    single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
-    multi_label_bow_config,
-    multi_label_cnn_config,
-    multi_label_default_config,
-)
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.tokens import Doc, DocBin
@@ -104,9 +100,7 @@ def test_issue3611():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(
-                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
-            )
+            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -143,9 +137,7 @@ def test_issue4030():
         optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
-            batches = util.minibatch(
-                train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
-            )
+            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
 
             for batch in batches:
                 nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -304,7 +296,6 @@ def test_issue9904():
 
     examples = get_examples()
     scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
-    scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
 
     loss = textcat.get_loss(examples, scores)[0]
     loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -474,8 +465,6 @@ def test_no_resize(name, textcat_config):
         # CNN
         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
-        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -613,12 +602,6 @@ def test_initialize_examples(name, get_examples, train_data):
         nlp.initialize(get_examples=get_examples())
 
 
-def test_is_distillable():
-    nlp = English()
-    textcat = nlp.add_pipe("textcat")
-    assert not textcat.is_distillable
-
-
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
@@ -963,11 +946,9 @@ def test_textcat_multi_threshold():
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 
 
-def test_save_activations():
 def test_save_activations():
     nlp = English()
     textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
-    textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -984,34 +965,6 @@ def test_save_activations():
     assert doc.activations["textcat"]["probabilities"].shape == (nO,)
 
 
-def test_save_activations_multi():
-    nlp = English()
-    textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA_MULTI_LABEL:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    nlp.initialize(get_examples=lambda: train_examples)
-    nO = textcat.model.get_dim("nO")
-
-    doc = nlp("This is a test.")
-    assert "textcat_multilabel" not in doc.activations
-
-    textcat.save_activations = True
-    doc = nlp("This is a test.")
-    assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
-    assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
-    nO = textcat.model.get_dim("nO")
-
-    doc = nlp("This is a test.")
-    assert "textcat" not in doc.activations
-
-    textcat.save_activations = True
-    doc = nlp("This is a test.")
-    assert list(doc.activations["textcat"].keys()) == ["probabilities"]
-    assert doc.activations["textcat"]["probabilities"].shape == (nO,)
-
-
 def test_save_activations_multi():
     nlp = English()
     textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 9fb6a72c87f..fc0404f1423 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
 
     cdef public dict activations
 
+    cdef public dict activations
+
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index dc7c0143029..5fda6f2f789 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,6 +8,7 @@ from typing import (
     List,
     Optional,
     Protocol,
+    Sequence,
     Tuple,
     Union,
     overload,
@@ -16,15 +17,20 @@ from typing import (
 import numpy as np
 from cymem.cymem import Pool
 from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-
+from .span import Span
+from .token import Token
+from .span_groups import SpanGroups
+from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
 from ..vocab import Vocab
-from .retokenizer import Retokenizer
+from ._dict_proxies import SpanGroups
+from ._retokenize import Retokenizer
 from .span import Span
-from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore
 
+DOCBIN_ALL_ATTRS: Tuple[str, ...]
+
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
 
@@ -34,6 +40,7 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
+    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
@@ -118,7 +125,6 @@ class Doc:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
-        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
@@ -146,12 +152,12 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...,
+        default: str = ...
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Tuple[Span]: ...
+    def noun_chunks(self) -> Iterator[Span]: ...
     @property
-    def sents(self) -> Tuple[Span]: ...
+    def sents(self) -> Iterator[Span]: ...
     @property
     def lang(self) -> int: ...
     @property
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index e92c0e833e0..310ce0dc88d 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -214,7 +214,6 @@ alignment mode `"strict".
 | `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
 | `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
 | `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
-| _keyword-only_                           |                                                                                                                                                                                                                                                                              |
 | `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
 | `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
 | `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
@@ -654,10 +653,11 @@ the [`TextCategorizer`](/api/textcategorizer).
 
 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Returns a tuple of the base noun phrases in the doc, if the document has been
-syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
-does not permit other NPs to be nested within it – so no NP-level coordination,
-no prepositional phrases, and no relative clauses.
+Iterate over the base noun phrases in the document. Yields base noun-phrase
+`Span` objects, if the document has been syntactically parsed. A base noun
+phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
+nested within it – so no NP-level coordination, no prepositional phrases, and no
+relative clauses.
 
 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
 > assert chunks[1].text == "another phrase"
 > ```
 
-| Name        | Description                                  |
-| ----------- | -------------------------------------------- |
-| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
+| Name       | Description                           |
+| ---------- | ------------------------------------- |
+| **YIELDS** | Noun chunks in the document. ~~Span~~ |
 
 ## Doc.sents {id="sents",tag="property",model="sentences"}
 
-Returns a tuple of the sentences in the document. Sentence spans have no label.
+Iterate over the sentences in the document. Sentence spans have no label.
 
 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -696,9 +696,9 @@ will raise an error otherwise.
 > assert [s.root.text for s in sents] == ["is", "'s"]
 > ```
 
-| Name        | Description                                |
-| ----------- | ------------------------------------------ |
-| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
+| Name       | Description                         |
+| ---------- | ----------------------------------- |
+| **YIELDS** | Sentences in the document. ~~Span~~ |
 
 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
 
@@ -762,6 +762,7 @@ The L2 norm of the document's vector representation.
 | `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
 | `lang` <Tag variant="new">2.1</Tag>        | Language of the document's vocabulary. ~~int~~                                                                                                 |
 | `lang_` <Tag variant="new">2.1</Tag>       | Language of the document's vocabulary. ~~str~~                                                                                                 |
+| `sentiment`                                | The document's positivity/negativity score, if available. ~~float~~                                                                            |
 | `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
 | `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
 | `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
@@ -785,6 +786,7 @@ serialization by passing in the string names via the `exclude` argument.
 | Name               | Description                                   |
 | ------------------ | --------------------------------------------- |
 | `text`             | The value of the `Doc.text` attribute.        |
+| `sentiment`        | The value of the `Doc.sentiment` attribute.   |
 | `tensor`           | The value of the `Doc.tensor` attribute.      |
 | `user_data`        | The value of the `Doc.user_data` dictionary.  |
 | `user_data_keys`   | The keys of the `Doc.user_data` dictionary.   |
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index f4b83d88bbf..fe720a60dd1 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
-| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
-| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
-| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
-| `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
-| `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
-| `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
-| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                                            |
-| `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
-| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` <Tag variant="new">3.5.1</Tag>  | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
-| `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
-| `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
-| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                     |
+| Setting                                         | Description                                                                                                                                                                                                                                                                                 |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              |
+| `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           |
+| `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        |
+| `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      |
+| `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      |
+| `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
+| `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
+| `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
+| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
+| `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 9514bc773b9..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |
@@ -121,39 +121,6 @@ delegate to the [`predict`](/api/morphologizer#predict) and
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
-## Morphologizer.distill {id="distill", tag="method,experimental", version="4"}
-
-Train a pipe (the student) on the predictions of another pipe (the teacher). The
-student is typically trained on the probability distribution of the teacher, but
-details may differ per pipe. The goal of distillation is to transfer knowledge
-from the teacher to the student.
-
-The distillation is performed on ~~Example~~ objects. The `Example.reference`
-and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
-same orthography. Even though the reference does not need have to have gold
-annotations, the teacher could adds its own annotations when necessary.
-
-This feature is experimental.
-
-> #### Example
->
-> ```python
-> teacher_pipe = teacher.add_pipe("morphologizer")
-> student_pipe = student.add_pipe("morphologizer")
-> optimizer = nlp.resume_training()
-> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
-> ```
-
-| Name           | Description                                                                                                                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~                                                                                                                                 |
-| `examples`     | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ |                                                                                                                                                                                             |
-| `drop`         | Dropout rate. ~~float~~                                                                                                                                                                     |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
-| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                                |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
-
 ## Morphologizer.pipe {id="pipe",tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
@@ -292,27 +259,6 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
-## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
-
-Calculate the loss and its gradient for the batch of student scores relative to
-the teacher scores.
-
-> #### Example
->
-> ```python
-> teacher_morphologizer = teacher.get_pipe("morphologizer")
-> student_morphologizer = student.add_pipe("morphologizer")
-> student_scores = student_morphologizer.predict([eg.predicted for eg in examples])
-> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples])
-> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores)
-> ```
-
-| Name             | Description                                                                 |
-| ---------------- | --------------------------------------------------------------------------- |
-| `teacher_scores` | Scores representing the teacher model's predictions.                        |
-| `student_scores` | Scores representing the student model's predictions.                        |
-| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-
 ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"}
 
 Create an optimizer for the pipeline component.

From 0ede5b70bd030423bb0761f6cf794d64164ac2f9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 426/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/pipeline/span_ruler.py                  |   8 +
 spacy/tests/pipeline/test_entity_ruler.py     |   6 +
 .../serialize/test_serialize_pipeline.py      |  19 +-
 website/docs/api/entityruler.mdx              | 313 +++---------------
 4 files changed, 69 insertions(+), 277 deletions(-)

diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index cd8fea36b47..3f876598013 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,6 +17,14 @@
 
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
+from ..language import Language
+from ..errors import Errors, Warnings
+from ..util import ensure_path, SimpleFrozenList, registry
+from ..tokens import Doc, Span
+from ..scorer import Scorer, get_ner_prf
+from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 12f2c9def2d..74731140688 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,6 +2,12 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, merge_entities
+from spacy.pipeline import SpanRuler
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index dd3d32571b1..8170488f758 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -8,14 +8,9 @@
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import (
-    DependencyParser,
-    EntityRecognizer,
-    SentenceRecognizer,
-    Tagger,
-    TextCategorizer,
-    TrainablePipe,
-)
+from spacy.pipeline import DependencyParser, EntityRecognizer
+from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
+from spacy.pipeline import TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
@@ -92,17 +87,12 @@ def test_issue_3526_1(en_vocab):
     nlp = Language(vocab=en_vocab)
     ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
-    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
-    ruler.add_patterns(patterns)
     ruler_bytes = ruler.to_bytes()
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     new_ruler = nlp.add_pipe(
         "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
     )
-    new_ruler = nlp.add_pipe(
-        "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
-    )
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(ruler)
     assert len(new_ruler.labels) == 4
@@ -126,7 +116,6 @@ def test_issue_3526_4(en_vocab):
 
 @pytest.mark.issue(4042)
 def test_issue4042():
-    """Test that serialization of an entity_ruler before NER works fine."""
     """Test that serialization of an entity_ruler before NER works fine."""
     nlp = English()
     # add ner pipe
@@ -192,7 +181,7 @@ def test_issue4042_bug2():
 @pytest.mark.issue(4725)
 def test_issue4725_1():
     """Ensure the pickling of the NER goes well"""
-    vocab = Vocab()
+    vocab = Vocab(vectors_name="test_vocab_add_vector")
     nlp = English(vocab=vocab)
     config = {
         "update_with_oracle_cut_size": 111,
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 335e87676c7..bc9ec050323 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,13 +1,24 @@
 ---
 title: EntityRuler
-tag: class
-source: spacy/pipeline/entityruler.py
-version: 2.1
+new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
 api_trainable: false
 ---
 
+<Infobox title="New in v4" variant="warning">
+
+As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
+implemented as a special case of the `SpanRuler` component.
+
+See the [migration guide](#migrating) below for differences between the v3
+`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
+component.
+
+See the [`SpanRuler`](/api/spanruler) API docs for the full API.
+
+</Infobox>
+
 The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
 token-based rules or exact phrase matches. It can be combined with the
 statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
@@ -64,273 +75,51 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-```python
-%%GITHUB_SPACY/spacy/pipeline/entityruler.py
-```
-
-## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
-
-Initialize the entity ruler. If patterns are supplied here, they need to be a
-list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
-be a token pattern (list) or a phrase pattern (string). For example:
-`{"label": "ORG", "pattern": "Apple"}`.
-
-> #### Example
->
-> ```python
-> # Construction via add_pipe
-> ruler = nlp.add_pipe("entity_ruler")
->
-> # Construction from class
-> from spacy.pipeline import EntityRuler
-> ruler = EntityRuler(nlp, overwrite_ents=True)
-> ```
-
-| Name                                                 | Description                                                                                                                                                                                                                           |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
-| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
-| _keyword-only_                                       |                                                                                                                                                                                                                                       |
-| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                           |
-| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
-| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
-| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
-| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
-| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
-
-## EntityRuler.initialize {id="initialize",tag="method",version="3"}
-
-Initialize the component with data and used before training to load in rules
-from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
-method is typically called by [`Language.initialize`](/api/language#initialize)
-and lets you customize arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-> #### Example
->
-> ```python
-> entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.entity_ruler]
->
-> [initialize.components.entity_ruler.patterns]
-> @readers = "srsly.read_jsonl.v1"
-> path = "corpus/entity_ruler_patterns.jsonl
-> ```
-
-| Name           | Description                                                                                                                                                          |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                                                      |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
-
-## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
-
-The number of all patterns added to the entity ruler.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> assert len(ruler) == 0
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert len(ruler) == 1
-> ```
-
-| Name        | Description                     |
-| ----------- | ------------------------------- |
-| **RETURNS** | The number of patterns. ~~int~~ |
-
-## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
-
-Whether a label is present in the patterns.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert "ORG" in ruler
-> assert not "PERSON" in ruler
-> ```
-
-| Name        | Description                                           |
-| ----------- | ----------------------------------------------------- |
-| `label`     | The label to check. ~~str~~                           |
-| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
-
-## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
-
-Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
-happens automatically after the component has been added to the pipeline using
-[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
-with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occurring first in the Doc
-is chosen.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
->
-> doc = nlp("A text about Apple.")
-> ents = [(ent.text, ent.label_) for ent in doc.ents]
-> assert ents == [("Apple", "ORG")]
-> ```
-
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
-| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+## Migrating from v3 {#migrating}
 
-## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+### Loading patterns
 
-Add patterns to the entity ruler. A pattern can either be a token pattern (list
-of dicts) or a phrase pattern (string). For more details, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
+Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
+initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
+path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
+JSONL file separately and then added through
+[`SpanRuler.initialize`](/api/spanruler#initialize]) or
+[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
 
-> #### Example
->
-> ```python
-> patterns = [
->     {"label": "ORG", "pattern": "Apple"},
->     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
-> ]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ```
-
-| Name       | Description                                                      |
-| ---------- | ---------------------------------------------------------------- |
-| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
-
-## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
-
-Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
-the ID does not exist.
-
-> #### Example
->
-> ```python
-> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ruler.remove("apple")
-> ```
-
-| Name | Description                         |
-| ---- | ----------------------------------- |
-| `id` | The ID of the pattern rule. ~~str~~ |
-
-## EntityRuler.to_disk {id="to_disk",tag="method"}
-
-Save the entity ruler patterns to a directory. The patterns will be saved as
-newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
-only the patterns are saved as JSONL. If a directory name is provided, a
-`patterns.jsonl` and `cfg` file with the component configuration is exported.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
-> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
-> ```
-
-| Name   | Description                                                                                                                                              |
-| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-
-## EntityRuler.from_disk {id="from_disk",tag="method"}
-
-Load the entity ruler from a path. Expects either a file containing
-newline-delimited JSON (JSONL) with one entry per line, or a directory
-containing a `patterns.jsonl` file and a `cfg` file with the component
-configuration.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
-> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
-> ```
-
-| Name        | Description                                                                                                   |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
-
-## EntityRuler.to_bytes {id="to_bytes",tag="method"}
-
-Serialize the entity ruler patterns to a bytestring.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler_bytes = ruler.to_bytes()
-> ```
-
-| Name        | Description                        |
-| ----------- | ---------------------------------- |
-| **RETURNS** | The serialized patterns. ~~bytes~~ |
-
-## EntityRuler.from_bytes {id="from_bytes",tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> ruler_bytes = ruler.to_bytes()
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_bytes(ruler_bytes)
-> ```
-
-| Name         | Description                                        |
-| ------------ | -------------------------------------------------- |
-| `bytes_data` | The bytestring to load. ~~bytes~~                  |
-| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
-
-## EntityRuler.labels {id="labels",tag="property"}
-
-All labels present in the match patterns.
-
-| Name        | Description                            |
-| ----------- | -------------------------------------- |
-| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.from_disk("patterns.jsonl")
++ import srsly
++ patterns = srsly.read_jsonl("patterns.jsonl")
++ ruler.add_patterns(patterns)
+```
 
-## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+### Saving patterns
 
-All entity IDs present in the `id` properties of the match patterns.
+`SpanRuler.to_disk` always saves the full component data to a directory and does
+not include an option to save the patterns to a single JSONL file.
 
-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.to_disk("patterns.jsonl")
++ import srsly
++ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
+```
 
-## EntityRuler.patterns {id="patterns",tag="property"}
+### Accessing token and phrase patterns
 
-Get all patterns that were added to the entity ruler.
+The separate token patterns and phrase patterns are no longer accessible under
+`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
+patterns in their original format using the property
+[`SpanRuler.patterns`](/api/spanruler#patterns).
 
-| Name        | Description                                                                              |
-| ----------- | ---------------------------------------------------------------------------------------- |
-| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+### Removing patterns by ID
 
-## Attributes {id="attributes"}
+[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
+remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
 
-| Name              | Description                                                                                                           |
-| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
-| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
-| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.remove("id")
++ ruler.remove_by_id("id")
+```

From 4d1fb671d53a5d9a51f978e14bae078fcdadfca1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 1 Mar 2023 16:00:02 +0100
Subject: [PATCH 427/504] Return Tuple[Span] for all Doc/Span attrs that
 provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
---
 spacy/tokens/doc.pyi     |   4 +-
 spacy/tokens/span.pyx    | 136 +++++++--------------------------------
 website/docs/api/doc.mdx |  23 ++++---
 3 files changed, 35 insertions(+), 128 deletions(-)

diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 5fda6f2f789..bdbd653e518 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -155,9 +155,9 @@ class Doc:
         default: str = ...
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
-    def sents(self) -> Iterator[Span]: ...
+    def sents(self) -> Tuple[Span]: ...
     @property
     def lang(self) -> int: ...
     @property
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 5b4f929028a..c9cef2bcdaa 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -7,13 +7,12 @@ import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport *
-from ..attrs cimport attr_id_t
+from ..attrs cimport ORTH, attr_id_t
 from ..lexeme cimport Lexeme
-from ..parts_of_speech cimport univ_pos_t
-from ..structs cimport LexemeC, TokenC
+from ..structs cimport TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t
-from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from ..typedefs cimport attr_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr
 from .token cimport Token
 
 from ..errors import Errors, Warnings
@@ -115,7 +114,6 @@ cdef class Span:
             end_char = start_char
         else:
             end_char = doc[end - 1].idx + len(doc[end - 1])
-        self.c = make_shared[SpanC](SpanC(
         self.c = make_shared[SpanC](SpanC(
             label=label,
             kb_id=kb_id,
@@ -125,7 +123,6 @@ cdef class Span:
             start_char=start_char,
             end_char=end_char,
         ))
-        ))
         self._vector = vector
         self._vector_norm = vector_norm
 
@@ -141,11 +138,9 @@ cdef class Span:
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
-            return span_c.start_char < other_span_c.start_char
         # <=
         elif op == 1:
             return span_c.start_char <= other_span_c.start_char
-            return span_c.start_char <= other_span_c.start_char
         # ==
         elif op == 2:
             # Do the cheap comparisons first
@@ -156,14 +151,6 @@ cdef class Span:
                 (span_c.kb_id == other_span_c.kb_id) and \
                 (self.doc == other.doc)
             )
-            # Do the cheap comparisons first
-            return (
-                (span_c.start_char == other_span_c.start_char) and \
-                (span_c.end_char == other_span_c.end_char) and \
-                (span_c.label == other_span_c.label) and \
-                (span_c.kb_id == other_span_c.kb_id) and \
-                (self.doc == other.doc)
-            )
         # !=
         elif op == 3:
             # Do the cheap comparisons first
@@ -174,22 +161,12 @@ cdef class Span:
                 (span_c.kb_id == other_span_c.kb_id) and \
                 (self.doc == other.doc)
             )
-            # Do the cheap comparisons first
-            return not (
-                (span_c.start_char == other_span_c.start_char) and \
-                (span_c.end_char == other_span_c.end_char) and \
-                (span_c.label == other_span_c.label) and \
-                (span_c.kb_id == other_span_c.kb_id) and \
-                (self.doc == other.doc)
-            )
         # >
         elif op == 4:
             return span_c.start_char > other_span_c.start_char
-            return span_c.start_char > other_span_c.start_char
         # >=
         elif op == 5:
             return span_c.start_char >= other_span_c.start_char
-            return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
         return hash(self._cmp_tuple())
@@ -215,12 +192,9 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#len
         """
         cdef SpanC* span_c = self.span_c()
-        if span_c.end < span_c.start:
-        cdef SpanC* span_c = self.span_c()
         if span_c.end < span_c.start:
             return 0
         return span_c.end - span_c.start
-        return span_c.end - span_c.start
 
     def __repr__(self):
         return self.text
@@ -235,18 +209,14 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#getitem
         """
         cdef SpanC* span_c = self.span_c()
-        cdef SpanC* span_c = self.span_c()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
         else:
             if i < 0:
                 token_i = span_c.end + i
-                token_i = span_c.end + i
             else:
                 token_i = span_c.start + i
-            if span_c.start <= token_i < span_c.end:
-                token_i = span_c.start + i
             if span_c.start <= token_i < span_c.end:
                 return self.doc[token_i]
             else:
@@ -260,8 +230,6 @@ cdef class Span:
         DOCS: https://spacy.io/api/span#iter
         """
         cdef SpanC* span_c = self.span_c()
-        for i in range(span_c.start, span_c.end):
-        cdef SpanC* span_c = self.span_c()
         for i in range(span_c.start, span_c.end):
             yield self.doc[i]
 
@@ -270,8 +238,8 @@ cdef class Span:
 
     @property
     def _(self):
-        """Custom extension attributes registered via `set_extension`."""
         cdef SpanC* span_c = self.span_c()
+        """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
                           start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
@@ -357,7 +325,6 @@ cdef class Span:
         cdef attr_t value
         cdef int i, head_col, ancestor_i
         cdef SpanC* span_c = self.span_c()
-        cdef SpanC* span_c = self.span_c()
         old_to_new_root = dict()
         if HEAD in attrs:
             head_col = attrs.index(HEAD)
@@ -365,7 +332,6 @@ cdef class Span:
                 # if the HEAD refers to a token outside this span, find a more appropriate ancestor
                 token = self[i]
                 ancestor_i = token.head.i - span_c.start   # span offset
-                ancestor_i = token.head.i - span_c.start   # span offset
                 if ancestor_i not in range(length):
                     if DEP in attrs:
                         array[i, attrs.index(DEP)] = dep
@@ -373,7 +339,6 @@ cdef class Span:
                     # try finding an ancestor within this span
                     ancestors = token.ancestors
                     for ancestor in ancestors:
-                        ancestor_i = ancestor.i - span_c.start
                         ancestor_i = ancestor.i - span_c.start
                         if ancestor_i in range(length):
                             array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
@@ -405,8 +370,6 @@ cdef class Span:
         """
         cdef SpanC* span_c = self.span_c()
         return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
-        cdef SpanC* span_c = self.span_c()
-        return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end))
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -519,9 +482,6 @@ cdef class Span:
     cdef SpanC* span_c(self):
         return self.c.get()
 
-    cdef SpanC* span_c(self):
-        return self.c.get()
-
     @property
     def sents(self):
         """Obtain the sentences that contain this span. If the given span
@@ -560,13 +520,13 @@ cdef class Span:
                     start = i
                     if start >= self.end:
                         break
-                elif i == self.doc.length - 1:
-                    spans.append(Span(self.doc, start, self.doc.length))
+            if start < self.end:
+                spans.append(Span(self.doc, start, self.end))
+        return tuple(spans)
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
-                spans.append(Span(self.doc, start, self.doc.length))
-        return tuple(spans)
+                yield Span(self.doc, start, self.doc.length)
 
     @property
     def ents(self):
@@ -580,14 +540,9 @@ cdef class Span:
         cdef Span ent
         cdef SpanC* span_c = self.span_c()
         cdef SpanC* ent_span_c
-        cdef SpanC* span_c = self.span_c()
-        cdef SpanC* ent_span_c
         ents = []
         for ent in self.doc.ents:
             ent_span_c = ent.span_c()
-            if ent_span_c.start >= span_c.start:
-                if ent_span_c.end <= span_c.end:
-            ent_span_c = ent.span_c()
             if ent_span_c.start >= span_c.start:
                 if ent_span_c.end <= span_c.end:
                     ents.append(ent)
@@ -714,12 +669,10 @@ cdef class Span:
         # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
         cdef SpanC* span_c = self.span_c()
-        cdef SpanC* span_c = self.span_c()
         # First, we scan through the Span, and check whether there's a word
         # with head==0, i.e. a sentence root. If so, we can return it. The
         # longer the span, the more likely it contains a sentence root, and
         # in this case we return in linear time.
-        for i in range(span_c.start, span_c.end):
         for i in range(span_c.start, span_c.end):
             if self.doc.c[i].head == 0:
                 return self.doc[i]
@@ -731,8 +684,6 @@ cdef class Span:
         # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
-        for i in range(span_c.start, span_c.end):
-            if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
         for i in range(span_c.start, span_c.end):
             if span_c.start <= (i+self.doc.c[i].head) < span_c.end:
                 continue
@@ -742,7 +693,6 @@ cdef class Span:
                 root = i
         if root == -1:
             return self.doc[span_c.start]
-            return self.doc[span_c.start]
         else:
             return self.doc[root]
 
@@ -768,10 +718,6 @@ cdef class Span:
         start_idx += span_c.start_char
         end_idx += span_c.start_char
         return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
-        cdef SpanC* span_c = self.span_c()
-        start_idx += span_c.start_char
-        end_idx += span_c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
 
     @property
     def conjuncts(self):
@@ -852,72 +798,42 @@ cdef class Span:
     property start:
         def __get__(self):
             return self.span_c().start
-            return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0 or start > self.doc.length:
-                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
-            cdef SpanC* span_c = self.span_c()
-            if start > span_c.end:
-                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
-            span_c.start = start
-            span_c.start_char = self.doc.c[start].idx
+            if start < 0:
+                raise IndexError("TODO")
+            self.span_c().start = start
 
     property end:
         def __get__(self):
             return self.span_c().end
-            return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0 or end > self.doc.length:
-                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
-            cdef SpanC* span_c = self.span_c()
-            if span_c.start > end:
-                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
-            span_c.end = end
-            if end > 0:
-                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
-            else:
-                span_c.end_char = 0
+            if end < 0:
+                raise IndexError("TODO")
+            self.span_c().end = end
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
-            return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0 or start_char > len(self.doc.text):
-                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
-            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
-            if start < 0:
-                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
-            cdef SpanC* span_c = self.span_c()
-            if start_char > span_c.end_char:
-                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
-            span_c.start_char = start_char
-            span_c.start = start
+            if start_char < 0:
+                raise IndexError("TODO")
+            self.span_c().start_char = start_char
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
-            return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0 or end_char > len(self.doc.text):
-                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
-            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
-            if end < 0:
-                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
-            cdef SpanC* span_c = self.span_c()
-            if span_c.start_char > end_char:
-                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
-            span_c.end_char = end_char
-            span_c.end = end
+            if end_char < 0:
+                raise IndexError("TODO")
+            self.span_c().end_char = end_char
 
     property label:
         def __get__(self):
             return self.span_c().label
-            return self.span_c().label
 
         def __set__(self, attr_t label):
             if label != self.span_c().label :
@@ -930,7 +846,6 @@ cdef class Span:
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
-            return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
             if kb_id != self.span_c().kb_id :
@@ -943,7 +858,6 @@ cdef class Span:
     property id:
         def __get__(self):
             return self.span_c().id
-            return self.span_c().id
 
         def __set__(self, attr_t id):
             if id != self.span_c().id :
@@ -954,16 +868,12 @@ cdef class Span:
                 Underscore._replace_keys(old, new)
 
     property ent_id:
-        """Alias for the span's ID."""
         """Alias for the span's ID."""
         def __get__(self):
             return self.id
-            return self.id
 
         def __set__(self, attr_t ent_id):
             self.id = ent_id
-        def __set__(self, attr_t ent_id):
-            self.id = ent_id
 
     @property
     def orth_(self):
@@ -979,7 +889,6 @@ cdef class Span:
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
     property label_:
-        """The span's label."""
         """The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
@@ -989,7 +898,6 @@ cdef class Span:
         self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
-        """The span's KB ID."""
         """The span's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
@@ -999,7 +907,6 @@ cdef class Span:
         self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
     property id_:
-        """The span's ID."""
         """The span's ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.id]
@@ -1017,6 +924,7 @@ cdef class Span:
             self.id_ = ent_id_
 
 
+
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 310ce0dc88d..c9735b65eac 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -653,11 +653,10 @@ the [`TextCategorizer`](/api/textcategorizer).
 
 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
 
-Iterate over the base noun phrases in the document. Yields base noun-phrase
-`Span` objects, if the document has been syntactically parsed. A base noun
-phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
-nested within it – so no NP-level coordination, no prepositional phrases, and no
-relative clauses.
+Returns a tuple of the base noun phrases in the doc, if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -674,13 +673,13 @@ implemented for the given language, a `NotImplementedError` is raised.
 > assert chunks[1].text == "another phrase"
 > ```
 
-| Name       | Description                           |
-| ---------- | ------------------------------------- |
-| **YIELDS** | Noun chunks in the document. ~~Span~~ |
+| Name        | Description                                  |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
 
 ## Doc.sents {id="sents",tag="property",model="sentences"}
 
-Iterate over the sentences in the document. Sentence spans have no label.
+Returns a tuple of the sentences in the document. Sentence spans have no label.
 
 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -696,9 +695,9 @@ will raise an error otherwise.
 > assert [s.root.text for s in sents] == ["is", "'s"]
 > ```
 
-| Name       | Description                         |
-| ---------- | ----------------------------------- |
-| **YIELDS** | Sentences in the document. ~~Span~~ |
+| Name        | Description                                |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
 
 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
 

From 829779d7e0c1f87b0eb5adfdc7e8a80de80a7df6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 12 Jun 2023 16:16:03 +0200
Subject: [PATCH 428/504] Remove Python 3.7 builds

---
 .github/workflows/tests.yml               | 62 +++++++++++------------
 .github/workflows/universe_validation.yml |  3 +-
 requirements.txt                          |  6 +--
 3 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index af115e817e9..9d5f7a8e09a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -30,7 +30,8 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
+          architecture: x64
 
       - name: black
         run: |
@@ -59,11 +60,9 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.12"]
         include:
-          - os: windows-latest
-            python_version: "3.7"
           - os: macos-latest
             python_version: "3.8"
-          - os: ubuntu-latest
+          - os: ubuntu-20.04
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
@@ -93,7 +92,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'
 
       - name: Delete source directory and .egg-info
         run: |
@@ -115,22 +113,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      #      - name: "Test download CLI"
+      #        run: |
+      #          python -m spacy download ca_core_news_sm
+      #          python -m spacy download ca_core_news_md
+      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test download_url in info CLI"
+      #        run: |
+      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test no warnings on load (#11713)"
+      #        run: |
+      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |
@@ -154,17 +152,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      #      - name: "Test assemble CLI"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test assemble CLI vectors warning"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml
index ce7df49dbae..b2a10258a8b 100644
--- a/.github/workflows/universe_validation.yml
+++ b/.github/workflows/universe_validation.yml
@@ -25,7 +25,8 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
+          architecture: x64
 
       - name: Validate website/meta/universe.json
         run: |
diff --git a/requirements.txt b/requirements.txt
index 94a9d17c0c3..fe695a445c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev1,<4.1.0
+spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests

From 0a1c31cd62960b4382143fd9c975f2877c181c00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 12 Jan 2022 13:38:52 +0100
Subject: [PATCH 429/504] Span/SpanGroup: wrap SpanC in shared_ptr (#9869)

* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/_parser_internals/ner.pyx | 17 ++-----
 spacy/tests/doc/test_span.py             | 11 +---
 spacy/tokens/span.pxd                    |  1 +
 spacy/tokens/span.pyx                    | 64 ++++++++++--------------
 4 files changed, 33 insertions(+), 60 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 324a497c9fb..ab522b1db79 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -4,6 +4,10 @@ from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
+from libc.stdint cimport int32_t
+from libcpp.memory cimport shared_ptr
+from libcpp.vector cimport vector
+from cymem.cymem cimport Pool
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 
@@ -54,7 +58,6 @@ MOVE_NAMES[OUT] = 'O'
 cdef struct GoldNERStateC:
     Transition* ner
     vector[shared_ptr[SpanC]] negs
-    vector[shared_ptr[SpanC]] negs
 
 
 cdef class BiluoGold:
@@ -100,8 +103,6 @@ cdef GoldNERStateC create_gold_state(
     # In order to handle negative samples, we need to maintain the full
     # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
     # thing, we'll get blocked if there's an incorrect prefix.
-    for neg in negs:
-        gs.negs.push_back(neg.c)
     for neg in negs:
         gs.negs.push_back(neg.c)
     return gs
@@ -420,8 +421,6 @@ cdef class Begin:
 
         cdef shared_ptr[SpanC] span
 
-        cdef shared_ptr[SpanC] span
-
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
@@ -439,8 +438,6 @@ cdef class Begin:
             # be correct or not. However, we can at least tell whether we're
             # going to be opening an entity where there's only one possible
             # L.
-            for span in gold.negs:
-                if span.get().label == label and span.get().start == b0:
             for span in gold.negs:
                 if span.get().label == label and span.get().start == b0:
                     cost += 1
@@ -584,9 +581,6 @@ cdef class Last:
         # by marking actions that close an entity that we know is incorrect
         # as costly.
         cdef shared_ptr[SpanC] span
-        for span in gold.negs:
-            if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
-        cdef shared_ptr[SpanC] span
         for span in gold.negs:
             if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
                 cost += 1
@@ -653,9 +647,6 @@ cdef class Unit:
         # action
         cdef int b0 = s.B(0)
         cdef shared_ptr[SpanC] span
-        for span in gold.negs:
-            if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
-        cdef shared_ptr[SpanC] span
         for span in gold.negs:
             if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
                 cost += 1
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index ccea3d120cf..0b05ca7c123 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,6 +6,7 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
@@ -163,16 +164,6 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
-@pytest.mark.issue(9556)
-def test_modify_span_group(doc):
-    group = SpanGroup(doc, spans=doc.ents)
-    for span in group:
-        span.start = 0
-        span.label = doc.vocab.strings["TEST"]
-
-    # Span changes must be reflected in the span group
-    assert group[0].start == 0
-    assert group[0].label == doc.vocab.strings["TEST"]
 @pytest.mark.issue(9556)
 def test_modify_span_group(doc):
     group = SpanGroup(doc, spans=doc.ents)
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index fb592e68bd8..68f722a13cb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 from libcpp.memory cimport shared_ptr
 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c9cef2bcdaa..b2ba23ca2a9 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -2,17 +2,20 @@
 cimport numpy as np
 from libc.math cimport sqrt
 from libcpp.memory cimport make_shared
+from libc.math cimport sqrt
+from libcpp.memory cimport make_shared
 
 import numpy
 from thinc.api import get_array_module
 
 from ..attrs cimport *
-from ..attrs cimport ORTH, attr_id_t
+from ..attrs cimport attr_id_t
 from ..lexeme cimport Lexeme
-from ..structs cimport TokenC
+from ..parts_of_speech cimport univ_pos_t
+from ..structs cimport LexemeC, TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr
+from ..typedefs cimport attr_t
+from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
 from .token cimport Token
 
 from ..errors import Errors, Warnings
@@ -133,8 +136,9 @@ cdef class Span:
             else:
                 return True
 
-        self_tuple = self._cmp_tuple()
-        other_tuple = other._cmp_tuple()
+        cdef SpanC* span_c = self.span_c()
+        cdef SpanC* other_span_c = other.span_c()
+
         # <
         if op == 0:
             return span_c.start_char < other_span_c.start_char
@@ -169,20 +173,8 @@ cdef class Span:
             return span_c.start_char >= other_span_c.start_char
 
     def __hash__(self):
-        return hash(self._cmp_tuple())
-
-    def _cmp_tuple(self):
         cdef SpanC* span_c = self.span_c()
-        return (
-            span_c.start_char,
-            span_c.end_char,
-            span_c.start,
-            span_c.end,
-            span_c.label,
-            span_c.kb_id,
-            span_c.id,
-            self.doc,
-        )
+        return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id))
 
     def __len__(self):
         """Get the number of tokens in the span.
@@ -240,8 +232,9 @@ cdef class Span:
     def _(self):
         cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
+        cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                          start=span_c.start_char, end=span_c.end_char)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -520,13 +513,13 @@ cdef class Span:
                     start = i
                     if start >= self.end:
                         break
-            if start < self.end:
-                spans.append(Span(self.doc, start, self.end))
-        return tuple(spans)
+                elif i == self.doc.length - 1:
+                    spans.append(Span(self.doc, start, self.doc.length))
 
             # Ensure that trailing parts of the Span instance are included in last element of .sents.
             if start == self.doc.length - 1:
-                yield Span(self.doc, start, self.doc.length)
+                spans.append(Span(self.doc, start, self.doc.length))
+        return tuple(spans)
 
     @property
     def ents(self):
@@ -836,28 +829,19 @@ cdef class Span:
             return self.span_c().label
 
         def __set__(self, attr_t label):
-            if label != self.span_c().label :
-                old_label = self.span_c().label
-                self.span_c().label = label
-                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
-                Underscore._replace_keys(old, new)
+            self.span_c().label = label
 
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            if kb_id != self.span_c().kb_id :
-                old_kb_id = self.span_c().kb_id
-                self.span_c().kb_id = kb_id
-                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
-                Underscore._replace_keys(old, new)
+            self.span_c().kb_id = kb_id
 
     property id:
         def __get__(self):
             return self.span_c().id
+            return self.span_c().id
 
         def __set__(self, attr_t id):
             if id != self.span_c().id :
@@ -868,12 +852,16 @@ cdef class Span:
                 Underscore._replace_keys(old, new)
 
     property ent_id:
+        """Alias for the span's ID."""
         """Alias for the span's ID."""
         def __get__(self):
             return self.id
+            return self.id
 
         def __set__(self, attr_t ent_id):
             self.id = ent_id
+        def __set__(self, attr_t ent_id):
+            self.id = ent_id
 
     @property
     def orth_(self):
@@ -889,6 +877,7 @@ cdef class Span:
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
     property label_:
+        """The span's label."""
         """The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
@@ -898,6 +887,7 @@ cdef class Span:
         self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
+        """The span's KB ID."""
         """The span's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
@@ -907,6 +897,7 @@ cdef class Span:
         self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
     property id_:
+        """The span's ID."""
         """The span's ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.id]
@@ -924,7 +915,6 @@ cdef class Span:
             self.id_ = ent_id_
 
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From 0609911eee4d3a726a17c290b7cc8742078a6fcd Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 15 Jul 2022 11:14:08 +0200
Subject: [PATCH 430/504] `Morphology`/`Morphologizer` optimizations and
 refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologzier`: Avoid unnecessary sorting of morpho features

* `Morphologizer`: Remove execessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
---
 spacy/morphology.pxd               | 13 +-----
 spacy/morphology.pyx               | 75 +++---------------------------
 spacy/tokens/morphanalysis.pxd     | 10 ++--
 spacy/tokens/morphanalysis.pyx     |  7 +++
 spacy/tokens/token.pyx             |  1 +
 website/docs/api/morphologizer.mdx |  4 +-
 6 files changed, 20 insertions(+), 90 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 122ab4f1ab4..ab8f854497b 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
-from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
@@ -28,7 +28,6 @@ cdef cppclass MorphAnalysisC:
 cdef class Morphology:
     cdef readonly StringStore strings
     cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
-    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags
 
     cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
     cdef void _intern_morph_tag(self, hash_t tag_key, feats)
@@ -36,18 +35,8 @@ cdef class Morphology:
     cdef str _normalize_features(self, features)
     cdef str get_morph_str(self, hash_t morph_key)
     cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
-    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
-    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
-    cdef hash_t _add(self, features)
-    cdef str _normalize_features(self, features)
-    cdef str get_morph_str(self, hash_t morph_key)
-    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)
 
 cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
 cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
 cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
 cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
-cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
-cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
-cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
-cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 5d5fa0369f8..57e4e7d10a3 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,10 @@
 # cython: infer_types
 import warnings
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy
-
+from typing import Union, Tuple, List, Dict, Optional
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
+from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 
@@ -80,13 +78,15 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
+
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-            ])
+        ])
         return norm_feats_string or self.EMPTY_MORPH
 
+
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
@@ -95,22 +95,10 @@ cdef class Morphology:
         """
         cdef hash_t tag_hash = 0
         cdef shared_ptr[MorphAnalysisC] tag
-        cdef hash_t tag_hash = 0
-        cdef shared_ptr[MorphAnalysisC] tag
         if isinstance(features, str):
             if features == "":
                 features = self.EMPTY_MORPH
 
-            tag_hash = self.strings[features]
-            tag = self._lookup_tag(tag_hash)
-            if tag:
-                return deref(tag).key
-
-            features = self._str_to_normalized_feat_dict(features)
-        elif isinstance(features, dict):
-            features = self._dict_to_normalized_feat_dict(features)
-        else:
-
             tag_hash = self.strings[features]
             tag = self._lookup_tag(tag_hash)
             if tag:
@@ -123,7 +111,6 @@ cdef class Morphology:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
 
-
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder
         norm_feats_string = self._normalized_feat_dict_to_str(features)
@@ -178,17 +165,6 @@ cdef class Morphology:
     cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
         return self._lookup_tag(morph_key)
 
-    cdef str _normalize_features(self, features):
-    cdef str get_morph_str(self, hash_t morph_key):
-        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
-        if not tag:
-            return ""
-        else:
-            return self.strings[deref(tag).key]
-
-    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
-        return self._lookup_tag(morph_key)
-
     cdef str _normalize_features(self, features):
         """Create a normalized FEATS string from a features string or dict.
 
@@ -199,10 +175,6 @@ cdef class Morphology:
             features = self._str_to_normalized_feat_dict(features)
         elif isinstance(features, dict):
             features = self._dict_to_normalized_feat_dict(features)
-        else:
-            features = self._str_to_normalized_feat_dict(features)
-        elif isinstance(features, dict):
-            features = self._dict_to_normalized_feat_dict(features)
         else:
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
@@ -215,22 +187,10 @@ cdef class Morphology:
     def get(self, morph_key):
         return self.get_morph_str(morph_key)
 
-    def normalize_features(self, features):
-        return self._normalize_features(features)
-
-        return self._normalized_feat_dict_to_str(features)
-
-    def add(self, features):
-        return self._add(features)
-
-    def get(self, morph_key):
-        return self.get_morph_str(morph_key)
-
     def normalize_features(self, features):
         return self._normalize_features(features)
 
     @staticmethod
-    def feats_to_dict(feats, *, sort_values=True):
     def feats_to_dict(feats, *, sort_values=True):
         if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
@@ -246,17 +206,6 @@ cdef class Morphology:
             out[field] = values
         return out
 
-        out = {}
-        for feat in feats.split(Morphology.FEATURE_SEP):
-            field, values = feat.split(Morphology.FIELD_SEP, 1)
-            if sort_values:
-                values = values.split(Morphology.VALUE_SEP)
-                values.sort()
-                values = Morphology.VALUE_SEP.join(values)
-
-            out[field] = values
-        return out
-
     @staticmethod
     def dict_to_feats(feats_dict):
         if len(feats_dict) == 0:
@@ -264,43 +213,31 @@ cdef class Morphology:
         return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
 
 
-cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
 cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
     cdef int i
-    for i in range(deref(morph).features.size()):
-        if deref(morph).features[i].value == feature:
     for i in range(deref(morph).features.size()):
         if deref(morph).features[i].value == feature:
             return True
     return False
 
 
-cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
 cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
     cdef int i
     features = []
-    for i in range(deref(morph).features.size()):
-        features.append(deref(morph).features[i].value)
     for i in range(deref(morph).features.size()):
         features.append(deref(morph).features[i].value)
     return features
 
 
-cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
-    cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
 cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
     cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
     n = get_n_by_field(<uint64_t*>results.data, morph, field)
     return results[:n]
 
 
-cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
 cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
     cdef int n_results = 0
     cdef int i
-    for i in range(deref(morph).features.size()):
-        if deref(morph).features[i].field == field:
-            results[n_results] = deref(morph).features[i].value
     for i in range(deref(morph).features.size()):
         if deref(morph).features[i].field == field:
             results[n_results] = deref(morph).features[i].value
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index 11fc535e7ca..f866488ecc2 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,7 @@
-from libcpp.memory cimport shared_ptr
-
-from ..morphology cimport MorphAnalysisC
-from ..typedefs cimport hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t
+from ..morphology cimport MorphAnalysisC
+from libcpp.memory cimport shared_ptr
 
 
 cdef class MorphAnalysis:
@@ -11,6 +10,3 @@ cdef class MorphAnalysis:
     cdef shared_ptr[MorphAnalysisC] c
 
     cdef void _init_c(self, hash_t key)
-    cdef shared_ptr[MorphAnalysisC] c
-
-    cdef void _init_c(self, hash_t key)
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index 80033dd8657..be9f32c99d3 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -9,6 +9,13 @@ from libcpp.memory cimport shared_ptr
 from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t, attr_t
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index fb63b3bf959..ec79f19cf20 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -40,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref
 
 from cython.operator cimport dereference as deref
 
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 1fda807cb32..9fa4feb18d7 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -400,8 +400,8 @@ coarse-grained POS as the feature `POS`.
 > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
 > ```
 
-| Name        | Description                                               |
-| ----------- | --------------------------------------------------------- |
+| Name        | Description                                            |
+| ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
 
 ## Morphologizer.label_data {id="label_data",tag="property",version="3"}

From af26015febf3156564d64f4141ff8c796894deb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 29 Jul 2022 15:12:19 +0200
Subject: [PATCH 431/504] precompute_hiddens/Parser: look up CPU ops once (v4)
 (#11068)

* precompute_hiddens/Parser: look up CPU ops once

* precompute_hiddens: make cpu_ops private
---
 spacy/pipeline/transition_parser.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 41c95c94747..632616db759 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -159,6 +159,7 @@ class Parser(TrainablePipe):
         self._rehearsal_model = None
         self.scorer = scorer
         self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
+        self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -400,7 +401,9 @@ class Parser(TrainablePipe):
         return states_or_beams
 
     def greedy_parse(self, docs, drop=0.):
-        self._resize()
+        cdef vector[StateC*] states
+        cdef StateClass state
+        cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
         with _change_attrs(self.model, beam_width=1):
             inputs = TransitionModelInputs(docs=docs, moves=self.moves)

From ea5c3ae6accfae3e9c66f7699d436d2b0d70ab04 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 10 Aug 2022 11:44:05 +0200
Subject: [PATCH 432/504] Rename modules for consistency (#11286)

* rename Python module to entity_ruler

* rename Python module to attribute_ruler
---
 spacy/pipeline/__init__.py            |   2 +-
 spacy/pipeline/entity_ruler.py        | 541 ++++++++++++++++++++++++++
 website/docs/api/entityruler.mdx      | 298 ++++++++++++--
 website/docs/usage/saving-loading.mdx |   6 +-
 4 files changed, 806 insertions(+), 41 deletions(-)
 create mode 100644 spacy/pipeline/entity_ruler.py

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 4d1f0d90663..d54e190cc81 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,9 +1,9 @@
 from .attribute_ruler import AttributeRuler
-from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
+from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/spacy/pipeline/entity_ruler.py b/spacy/pipeline/entity_ruler.py
new file mode 100644
index 00000000000..3683cfc0270
--- /dev/null
+++ b/spacy/pipeline/entity_ruler.py
@@ -0,0 +1,541 @@
+import warnings
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+import srsly
+
+from ..errors import Errors, Warnings
+from ..language import Language
+from ..matcher import Matcher, PhraseMatcher
+from ..matcher.levenshtein import levenshtein_compare
+from ..scorer import get_ner_prf
+from ..tokens import Doc, Span
+from ..training import Example
+from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
+from .pipe import Pipe
+
+DEFAULT_ENT_ID_SEP = "||"
+PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
+
+
+@Language.factory(
+    "entity_ruler",
+    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
+    default_config={
+        "phrase_matcher_attr": None,
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
+        "validate": False,
+        "overwrite_ents": False,
+        "ent_id_sep": DEFAULT_ENT_ID_SEP,
+        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
+    },
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
+)
+def make_entity_ruler(
+    nlp: Language,
+    name: str,
+    phrase_matcher_attr: Optional[Union[int, str]],
+    matcher_fuzzy_compare: Callable,
+    validate: bool,
+    overwrite_ents: bool,
+    ent_id_sep: str,
+    scorer: Optional[Callable],
+):
+    return EntityRuler(
+        nlp,
+        name,
+        phrase_matcher_attr=phrase_matcher_attr,
+        matcher_fuzzy_compare=matcher_fuzzy_compare,
+        validate=validate,
+        overwrite_ents=overwrite_ents,
+        ent_id_sep=ent_id_sep,
+        scorer=scorer,
+    )
+
+
+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score
+
+
+class EntityRuler(Pipe):
+    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
+    rules or exact phrase matches. It can be combined with the statistical
+    `EntityRecognizer` to boost accuracy, or used on its own to implement a
+    purely rule-based entity recognition system. After initialization, the
+    component is typically added to the pipeline using `nlp.add_pipe`.
+
+    DOCS: https://spacy.io/api/entityruler
+    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
+    """
+
+    def __init__(
+        self,
+        nlp: Language,
+        name: str = "entity_ruler",
+        *,
+        phrase_matcher_attr: Optional[Union[int, str]] = None,
+        matcher_fuzzy_compare: Callable = levenshtein_compare,
+        validate: bool = False,
+        overwrite_ents: bool = False,
+        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
+        patterns: Optional[List[PatternType]] = None,
+        scorer: Optional[Callable] = entity_ruler_score,
+    ) -> None:
+        """Initialize the entity ruler. If patterns are supplied here, they
+        need to be a list of dictionaries with a `"label"` and `"pattern"`
+        key. A pattern can either be a token pattern (list) or a phrase pattern
+        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
+
+        nlp (Language): The shared nlp object to pass the vocab to the matchers
+            and process phrase patterns.
+        name (str): Instance name of the current pipeline component. Typically
+            passed in automatically from the factory when the component is
+            added. Used to disable the current entity ruler while creating
+            phrase patterns with the nlp object.
+        phrase_matcher_attr (int / str): Token attribute to match on, passed
+            to the internal PhraseMatcher as `attr`.
+        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
+            internal Matcher. Defaults to
+            spacy.matcher.levenshtein.levenshtein_compare.
+        validate (bool): Whether patterns should be validated, passed to
+            Matcher and PhraseMatcher as `validate`
+        patterns (iterable): Optional patterns to load in.
+        overwrite_ents (bool): If existing entities are present, e.g. entities
+            added by the model, overwrite them by matches if necessary.
+        ent_id_sep (str): Separator used internally for entity IDs.
+        scorer (Optional[Callable]): The scoring method. Defaults to
+            spacy.scorer.get_ner_prf.
+
+        DOCS: https://spacy.io/api/entityruler#init
+        """
+        self.nlp = nlp
+        self.name = name
+        self.overwrite = overwrite_ents
+        self.token_patterns = defaultdict(list)  # type: ignore
+        self.phrase_patterns = defaultdict(list)  # type: ignore
+        self._validate = validate
+        self.matcher_fuzzy_compare = matcher_fuzzy_compare
+        self.matcher = Matcher(
+            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
+        )
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
+        self.ent_id_sep = ent_id_sep
+        self._ent_ids = defaultdict(tuple)  # type: ignore
+        if patterns is not None:
+            self.add_patterns(patterns)
+        self.scorer = scorer
+
+    def __len__(self) -> int:
+        """The number of all patterns added to the entity ruler."""
+        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
+        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
+        return n_token_patterns + n_phrase_patterns
+
+    def __contains__(self, label: str) -> bool:
+        """Whether a label is present in the patterns."""
+        return label in self.token_patterns or label in self.phrase_patterns
+
+    def __call__(self, doc: Doc) -> Doc:
+        """Find matches in document and add them as entities.
+
+        doc (Doc): The Doc object in the pipeline.
+        RETURNS (Doc): The Doc with added entities, if available.
+
+        DOCS: https://spacy.io/api/entityruler#call
+        """
+        error_handler = self.get_error_handler()
+        try:
+            matches = self.match(doc)
+            self.set_annotations(doc, matches)
+            return doc
+        except Exception as e:
+            return error_handler(self.name, self, [doc], e)
+
+    def match(self, doc: Doc):
+        self._require_patterns()
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+
+        final_matches = set(
+            [(m_id, start, end) for m_id, start, end in matches if start != end]
+        )
+        get_sort_key = lambda m: (m[2] - m[1], -m[1])
+        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
+        return final_matches
+
+    def set_annotations(self, doc, matches):
+        """Modify the document in place"""
+        entities = list(doc.ents)
+        new_entities = []
+        seen_tokens = set()
+        for match_id, start, end in matches:
+            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
+                continue
+            # check for end - 1 here because boundaries are inclusive
+            if start not in seen_tokens and end - 1 not in seen_tokens:
+                if match_id in self._ent_ids:
+                    label, ent_id = self._ent_ids[match_id]
+                    span = Span(doc, start, end, label=label, span_id=ent_id)
+                else:
+                    span = Span(doc, start, end, label=match_id)
+                new_entities.append(span)
+                entities = [
+                    e for e in entities if not (e.start < end and e.end > start)
+                ]
+                seen_tokens.update(range(start, end))
+        doc.ents = entities + new_entities
+
+    @property
+    def labels(self) -> Tuple[str, ...]:
+        """All labels present in the match patterns.
+
+        RETURNS (set): The string labels.
+
+        DOCS: https://spacy.io/api/entityruler#labels
+        """
+        keys = set(self.token_patterns.keys())
+        keys.update(self.phrase_patterns.keys())
+        all_labels = set()
+
+        for l in keys:
+            if self.ent_id_sep in l:
+                label, _ = self._split_label(l)
+                all_labels.add(label)
+            else:
+                all_labels.add(l)
+        return tuple(sorted(all_labels))
+
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns: Optional[Sequence[PatternType]] = None,
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns Optional[Iterable[PatternType]]: The list of patterns.
+
+        DOCS: https://spacy.io/api/entityruler#initialize
+        """
+        self.clear()
+        if patterns:
+            self.add_patterns(patterns)  # type: ignore[arg-type]
+
+    @property
+    def ent_ids(self) -> Tuple[Optional[str], ...]:
+        """All entity ids present in the match patterns `id` properties
+
+        RETURNS (set): The string entity ids.
+
+        DOCS: https://spacy.io/api/entityruler#ent_ids
+        """
+        keys = set(self.token_patterns.keys())
+        keys.update(self.phrase_patterns.keys())
+        all_ent_ids = set()
+
+        for l in keys:
+            if self.ent_id_sep in l:
+                _, ent_id = self._split_label(l)
+                all_ent_ids.add(ent_id)
+        return tuple(all_ent_ids)
+
+    @property
+    def patterns(self) -> List[PatternType]:
+        """Get all patterns that were added to the entity ruler.
+
+        RETURNS (list): The original patterns, one dictionary per pattern.
+
+        DOCS: https://spacy.io/api/entityruler#patterns
+        """
+        all_patterns = []
+        for label, patterns in self.token_patterns.items():
+            for pattern in patterns:
+                ent_label, ent_id = self._split_label(label)
+                p = {"label": ent_label, "pattern": pattern}
+                if ent_id:
+                    p["id"] = ent_id
+                all_patterns.append(p)
+        for label, patterns in self.phrase_patterns.items():
+            for pattern in patterns:
+                ent_label, ent_id = self._split_label(label)
+                p = {"label": ent_label, "pattern": pattern.text}
+                if ent_id:
+                    p["id"] = ent_id
+                all_patterns.append(p)
+        return all_patterns
+
+    def add_patterns(self, patterns: List[PatternType]) -> None:
+        """Add patterns to the entity ruler. A pattern can either be a token
+        pattern (list of dicts) or a phrase pattern (string). For example:
+        {'label': 'ORG', 'pattern': 'Apple'}
+        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
+
+        patterns (list): The patterns to add.
+
+        DOCS: https://spacy.io/api/entityruler#add_patterns
+        """
+
+        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
+        try:
+            current_index = -1
+            for i, (name, pipe) in enumerate(self.nlp.pipeline):
+                if self == pipe:
+                    current_index = i
+                    break
+            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
+        except ValueError:
+            subsequent_pipes = []
+        with self.nlp.select_pipes(disable=subsequent_pipes):
+            token_patterns = []
+            phrase_pattern_labels = []
+            phrase_pattern_texts = []
+            phrase_pattern_ids = []
+            for entry in patterns:
+                if isinstance(entry["pattern"], str):
+                    phrase_pattern_labels.append(entry["label"])
+                    phrase_pattern_texts.append(entry["pattern"])
+                    phrase_pattern_ids.append(entry.get("id"))
+                elif isinstance(entry["pattern"], list):
+                    token_patterns.append(entry)
+            phrase_patterns = []
+            for label, pattern, ent_id in zip(
+                phrase_pattern_labels,
+                self.nlp.pipe(phrase_pattern_texts),
+                phrase_pattern_ids,
+            ):
+                phrase_pattern = {"label": label, "pattern": pattern}
+                if ent_id:
+                    phrase_pattern["id"] = ent_id
+                phrase_patterns.append(phrase_pattern)
+            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
+                label = entry["label"]  # type: ignore
+                if "id" in entry:
+                    ent_label = label
+                    label = self._create_label(label, entry["id"])
+                    key = self.matcher._normalize_key(label)
+                    self._ent_ids[key] = (ent_label, entry["id"])
+                pattern = entry["pattern"]  # type: ignore
+                if isinstance(pattern, Doc):
+                    self.phrase_patterns[label].append(pattern)
+                    self.phrase_matcher.add(label, [pattern])  # type: ignore
+                elif isinstance(pattern, list):
+                    self.token_patterns[label].append(pattern)
+                    self.matcher.add(label, [pattern])
+                else:
+                    raise ValueError(Errors.E097.format(pattern=pattern))
+
+    def clear(self) -> None:
+        """Reset all patterns."""
+        self.token_patterns = defaultdict(list)
+        self.phrase_patterns = defaultdict(list)
+        self._ent_ids = defaultdict(tuple)
+        self.matcher = Matcher(
+            self.nlp.vocab,
+            validate=self._validate,
+            fuzzy_compare=self.matcher_fuzzy_compare,
+        )
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )
+
+    def remove(self, ent_id: str) -> None:
+        """Remove a pattern by its ent_id if a pattern with this ent_id was added before
+
+        ent_id (str): id of the pattern to be removed
+        RETURNS: None
+        DOCS: https://spacy.io/api/entityruler#remove
+        """
+        label_id_pairs = [
+            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
+        ]
+        if not label_id_pairs:
+            raise ValueError(
+                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
+            )
+        created_labels = [
+            self._create_label(label, eid) for (label, eid) in label_id_pairs
+        ]
+        # remove the patterns from self.phrase_patterns
+        self.phrase_patterns = defaultdict(
+            list,
+            {
+                label: val
+                for (label, val) in self.phrase_patterns.items()
+                if label not in created_labels
+            },
+        )
+        # remove the patterns from self.token_pattern
+        self.token_patterns = defaultdict(
+            list,
+            {
+                label: val
+                for (label, val) in self.token_patterns.items()
+                if label not in created_labels
+            },
+        )
+        # remove the patterns from self.token_pattern
+        for label in created_labels:
+            if label in self.phrase_matcher:
+                self.phrase_matcher.remove(label)
+            else:
+                self.matcher.remove(label)
+
+    def _require_patterns(self) -> None:
+        """Raise a warning if this component has no patterns defined."""
+        if len(self) == 0:
+            warnings.warn(Warnings.W036.format(name=self.name))
+
+    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
+        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
+
+        label (str): The value of label in a pattern entry
+        RETURNS (tuple): ent_label, ent_id
+        """
+        if self.ent_id_sep in label:
+            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
+        else:
+            ent_label = label
+            ent_id = None  # type: ignore
+        return ent_label, ent_id
+
+    def _create_label(self, label: Any, ent_id: Any) -> str:
+        """Join Entity label with ent_id if the pattern has an `id` attribute
+        If ent_id is not a string, the label is returned as is.
+
+        label (str): The label to set for ent.label_
+        ent_id (str): The label
+        RETURNS (str): The ent_label joined with configured `ent_id_sep`
+        """
+        if isinstance(ent_id, str):
+            label = f"{label}{self.ent_id_sep}{ent_id}"
+        return label
+
+    def from_bytes(
+        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "EntityRuler":
+        """Load the entity ruler from a bytestring.
+
+        patterns_bytes (bytes): The bytestring to load.
+        RETURNS (EntityRuler): The loaded entity ruler.
+
+        DOCS: https://spacy.io/api/entityruler#from_bytes
+        """
+        cfg = srsly.msgpack_loads(patterns_bytes)
+        self.clear()
+        if isinstance(cfg, dict):
+            self.add_patterns(cfg.get("patterns", cfg))
+            self.overwrite = cfg.get("overwrite", False)
+            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab,
+                attr=self.phrase_matcher_attr,
+            )
+            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
+        else:
+            self.add_patterns(cfg)
+        return self
+
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the entity ruler patterns to a bytestring.
+
+        RETURNS (bytes): The serialized patterns.
+
+        DOCS: https://spacy.io/api/entityruler#to_bytes
+        """
+        serial = {
+            "overwrite": self.overwrite,
+            "ent_id_sep": self.ent_id_sep,
+            "phrase_matcher_attr": self.phrase_matcher_attr,
+            "patterns": self.patterns,
+        }
+        return srsly.msgpack_dumps(serial)
+
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "EntityRuler":
+        """Load the entity ruler from a file. Expects a file containing
+        newline-delimited JSON (JSONL) with one entry per line.
+
+        path (str / Path): The JSONL file to load.
+        RETURNS (EntityRuler): The loaded entity ruler.
+
+        DOCS: https://spacy.io/api/entityruler#from_disk
+        """
+        path = ensure_path(path)
+        self.clear()
+        depr_patterns_path = path.with_suffix(".jsonl")
+        if path.suffix == ".jsonl":  # user provides a jsonl
+            if path.is_file:
+                patterns = srsly.read_jsonl(path)
+                self.add_patterns(patterns)
+            else:
+                raise ValueError(Errors.E1023.format(path=path))
+        elif depr_patterns_path.is_file():
+            patterns = srsly.read_jsonl(depr_patterns_path)
+            self.add_patterns(patterns)
+        elif path.is_dir():  # path is a valid directory
+            cfg = {}
+            deserializers_patterns = {
+                "patterns": lambda p: self.add_patterns(
+                    srsly.read_jsonl(p.with_suffix(".jsonl"))
+                )
+            }
+            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
+            from_disk(path, deserializers_cfg, {})
+            self.overwrite = cfg.get("overwrite", False)
+            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
+            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
+
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
+            from_disk(path, deserializers_patterns, {})
+        else:  # path is not a valid directory or file
+            raise ValueError(Errors.E146.format(path=path))
+        return self
+
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
+        """Save the entity ruler patterns to a directory. The patterns will be
+        saved as newline-delimited JSON (JSONL).
+
+        path (str / Path): The JSONL file to save.
+
+        DOCS: https://spacy.io/api/entityruler#to_disk
+        """
+        path = ensure_path(path)
+        cfg = {
+            "overwrite": self.overwrite,
+            "phrase_matcher_attr": self.phrase_matcher_attr,
+            "ent_id_sep": self.ent_id_sep,
+        }
+        serializers = {
+            "patterns": lambda p: srsly.write_jsonl(
+                p.with_suffix(".jsonl"), self.patterns
+            ),
+            "cfg": lambda p: srsly.write_json(p, cfg),
+        }
+        if path.suffix == ".jsonl":  # user wants to save only JSONL
+            srsly.write_jsonl(path, self.patterns)
+        else:
+            to_disk(path, serializers, {})
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index bc9ec050323..8a5dccd329b 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,5 +1,7 @@
 ---
 title: EntityRuler
+tag: class
+source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
@@ -75,51 +77,273 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-## Migrating from v3 {#migrating}
+```python
+%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
+```
 
-### Loading patterns
+## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
 
-Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
-initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
-path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
-JSONL file separately and then added through
-[`SpanRuler.initialize`](/api/spanruler#initialize]) or
-[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
+Initialize the entity ruler. If patterns are supplied here, they need to be a
+list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
+be a token pattern (list) or a phrase pattern (string). For example:
+`{"label": "ORG", "pattern": "Apple"}`.
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.from_disk("patterns.jsonl")
-+ import srsly
-+ patterns = srsly.read_jsonl("patterns.jsonl")
-+ ruler.add_patterns(patterns)
-```
+> #### Example
+>
+> ```python
+> # Construction via add_pipe
+> ruler = nlp.add_pipe("entity_ruler")
+>
+> # Construction from class
+> from spacy.pipeline import EntityRuler
+> ruler = EntityRuler(nlp, overwrite_ents=True)
+> ```
 
-### Saving patterns
+| Name                                                 | Description                                                                                                                                                                                                                           |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
+| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
+| _keyword-only_                                       |                                                                                                                                                                                                                                       |
+| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
+| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
+| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
+| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
+| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
+| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
+| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
 
-`SpanRuler.to_disk` always saves the full component data to a directory and does
-not include an option to save the patterns to a single JSONL file.
+## EntityRuler.initialize {id="initialize",tag="method",version="3"}
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.to_disk("patterns.jsonl")
-+ import srsly
-+ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
-```
+Initialize the component with data and used before training to load in rules
+from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
+method is typically called by [`Language.initialize`](/api/language#initialize)
+and lets you customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
-### Accessing token and phrase patterns
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl
+> ```
 
-The separate token patterns and phrase patterns are no longer accessible under
-`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
-patterns in their original format using the property
-[`SpanRuler.patterns`](/api/spanruler#patterns).
+| Name           | Description                                                                                                                                                          |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |                                                                                                                                                                      |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
 
-### Removing patterns by ID
+## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
 
-[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
-remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
+The number of all patterns added to the entity ruler.
 
-```diff
- ruler = nlp.get_pipe("entity_ruler")
-- ruler.remove("id")
-+ ruler.remove_by_id("id")
-```
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> assert len(ruler) == 0
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+> assert len(ruler) == 1
+> ```
+
+| Name        | Description                     |
+| ----------- | ------------------------------- |
+| **RETURNS** | The number of patterns. ~~int~~ |
+
+## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
+
+Whether a label is present in the patterns.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+> assert "ORG" in ruler
+> assert not "PERSON" in ruler
+> ```
+
+| Name        | Description                                           |
+| ----------- | ----------------------------------------------------- |
+| `label`     | The label to check. ~~str~~                           |
+| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
+
+## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
+
+Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
+happens automatically after the component has been added to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
+with `overwrite_ents=True`, existing entities will be replaced if they overlap
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes
+longer patterns over shorter, and if equal the match occuring first in the Doc
+is chosen.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
+>
+> doc = nlp("A text about Apple.")
+> ents = [(ent.text, ent.label_) for ent in doc.ents]
+> assert ents == [("Apple", "ORG")]
+> ```
+
+| Name        | Description                                                          |
+| ----------- | -------------------------------------------------------------------- |
+| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
+| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
+
+## EntityRuler.add_patterns {id="add_patterns",tag="method"}
+
+Add patterns to the entity ruler. A pattern can either be a token pattern (list
+of dicts) or a phrase pattern (string). For more details, see the usage guide on
+[rule-based matching](/usage/rule-based-matching).
+
+> #### Example
+>
+> ```python
+> patterns = [
+>     {"label": "ORG", "pattern": "Apple"},
+>     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
+> ]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ```
+
+| Name       | Description                                                      |
+| ---------- | ---------------------------------------------------------------- |
+| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
+
+## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
+
+Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
+the ID does not exist.
+
+> #### Example
+>
+> ```python
+> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.add_patterns(patterns)
+> ruler.remove("apple")
+> ```
+
+| Name | Description                         |
+| ---- | ----------------------------------- |
+| `id` | The ID of the pattern rule. ~~str~~ |
+
+## EntityRuler.to_disk {id="to_disk",tag="method"}
+
+Save the entity ruler patterns to a directory. The patterns will be saved as
+newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
+only the patterns are saved as JSONL. If a directory name is provided, a
+`patterns.jsonl` and `cfg` file with the component configuration is exported.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
+> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
+> ```
+
+| Name   | Description                                                                                                                                              |
+| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+
+## EntityRuler.from_disk {id="from_disk",tag="method"}
+
+Load the entity ruler from a path. Expects either a file containing
+newline-delimited JSON (JSONL) with one entry per line, or a directory
+containing a `patterns.jsonl` file and a `cfg` file with the component
+configuration.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
+> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
+> ```
+
+| Name        | Description                                                                                                   |
+| ----------- | ------------------------------------------------------------------------------------------------------------- |
+| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
+
+## EntityRuler.to_bytes {id="to_bytes",tag="method"}
+
+Serialize the entity ruler patterns to a bytestring.
+
+> #### Example
+>
+> ```python
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler_bytes = ruler.to_bytes()
+> ```
+
+| Name        | Description                        |
+| ----------- | ---------------------------------- |
+| **RETURNS** | The serialized patterns. ~~bytes~~ |
+
+## EntityRuler.from_bytes {id="from_bytes",tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> ruler_bytes = ruler.to_bytes()
+> ruler = nlp.add_pipe("entity_ruler")
+> ruler.from_bytes(ruler_bytes)
+> ```
+
+| Name         | Description                                        |
+| ------------ | -------------------------------------------------- |
+| `bytes_data` | The bytestring to load. ~~bytes~~                  |
+| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
+
+## EntityRuler.labels {id="labels",tag="property"}
+
+All labels present in the match patterns.
+
+| Name        | Description                            |
+| ----------- | -------------------------------------- |
+| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+
+## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+
+All entity IDs present in the `id` properties of the match patterns.
+
+| Name        | Description                         |
+| ----------- | ----------------------------------- |
+| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+
+## EntityRuler.patterns {id="patterns",tag="property"}
+
+Get all patterns that were added to the entity ruler.
+
+| Name        | Description                                                                              |
+| ----------- | ---------------------------------------------------------------------------------------- |
+| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+
+## Attributes {id="attributes"}
+
+| Name              | Description                                                                                                           |
+| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
+| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
+| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
+| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 3712fbeeb80..420dc1d281e 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,9 +187,9 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the
-> [`SpanRuler`](/api/spanruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the new
+> [`EntityRuler`](/api/entityruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with rule-based components _with_ all the component

From 3288c42c8c8b02bfa5424ceb3d1b89e9a273850b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 22 Aug 2022 15:52:24 +0200
Subject: [PATCH 433/504] Cleanup Cython structs (#11337)

* cleanup Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG 18 instead

* import fix

* fix clumpsy fingers

* revert symbol changes in favor of #11352

* bint instead of bool
---
 spacy/tokenizer.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 8485a57c8aa..2d6c879e360 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -9,11 +9,17 @@ from preshed.maps cimport PreshMap
 
 import re
 
+from .tokens.doc cimport Doc
+from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
+from .attrs import intify_attrs
+from .symbols import ORTH, NORM
+from .errors import Errors
 from . import util
+from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
@@ -142,10 +148,8 @@ cdef class Tokenizer:
     property faster_heuristics:
         def __get__(self):
             return self._faster_heuristics
-            return self._faster_heuristics
 
         def __set__(self, faster_heuristics):
-            self._faster_heuristics = faster_heuristics
             self._faster_heuristics = faster_heuristics
             self._reload_special_cases()
 >>>>>>> 5abfa8215 (Cleanup Cython structs (#11337))

From a8184a62ecd013614fe13820120df5b7546aa5f9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 22 Aug 2022 20:28:57 +0200
Subject: [PATCH 434/504] Make Span/Doc.ents more consistent for ent_kb_id and
 ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
---
 spacy/tests/doc/test_span.py               | 47 ----------------------
 spacy/tokens/span.pyi                      | 10 -----
 spacy/tokens/span.pyx                      | 24 +++++------
 website/docs/api/span.mdx                  |  1 +
 website/docs/api/token.mdx                 |  1 +
 website/docs/usage/rule-based-matching.mdx | 10 ++---
 6 files changed, 16 insertions(+), 77 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 0b05ca7c123..74874624888 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -696,50 +696,3 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
-
-
-def test_span_start_end_sync(en_tokenizer):
-    doc = en_tokenizer("a bc def e fghij kl")
-    # can create and edit span starts/ends
-    span = doc[2:4]
-    span.start_char = 2
-    span.end = 5
-    assert span == doc[span.start : span.end]
-    assert span == doc.char_span(span.start_char, span.end_char)
-    # cannot set completely out of bounds starts/ends
-    with pytest.raises(IndexError):
-        span.start = -1
-    with pytest.raises(IndexError):
-        span.end = -1
-    with pytest.raises(IndexError):
-        span.start_char = len(doc.text) + 1
-    with pytest.raises(IndexError):
-        span.end = len(doc.text) + 1
-    # test all possible char starts/ends
-    span = doc[0 : len(doc)]
-    token_char_starts = [token.idx for token in doc]
-    token_char_ends = [token.idx + len(token.text) for token in doc]
-    for i in range(len(doc.text)):
-        if i not in token_char_starts:
-            with pytest.raises(ValueError):
-                span.start_char = i
-        else:
-            span.start_char = i
-    span = doc[0 : len(doc)]
-    for i in range(len(doc.text)):
-        if i not in token_char_ends:
-            with pytest.raises(ValueError):
-                span.end_char = i
-        else:
-            span.end_char = i
-    # start must be <= end
-    span = doc[1:3]
-    with pytest.raises(ValueError):
-        span.start = 4
-    with pytest.raises(ValueError):
-        span.end = 0
-    span = doc.char_span(2, 8)
-    with pytest.raises(ValueError):
-        span.start_char = 9
-    with pytest.raises(ValueError):
-        span.end_char = 1
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 5039b33eee2..2a529593e5f 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -134,21 +134,11 @@ class Span:
     @property
     def ent_id(self) -> int: ...
     @property
-    def label(self) -> int: ...
-    @property
-    def kb_id(self) -> int: ...
-    @property
-    def id(self) -> int: ...
-    @property
-    def ent_id(self) -> int: ...
-    @property
     def orth_(self) -> str: ...
     @property
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
-    @label_.setter
-    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index b2ba23ca2a9..e9bc3c7311c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -841,27 +841,17 @@ cdef class Span:
     property id:
         def __get__(self):
             return self.span_c().id
-            return self.span_c().id
 
         def __set__(self, attr_t id):
-            if id != self.span_c().id :
-                old_id = self.span_c().id
-                self.span_c().id = id
-                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
-                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
-                Underscore._replace_keys(old, new)
+            self.span_c().id = id
 
     property ent_id:
-        """Alias for the span's ID."""
         """Alias for the span's ID."""
         def __get__(self):
             return self.id
-            return self.id
 
         def __set__(self, attr_t ent_id):
             self.id = ent_id
-        def __set__(self, attr_t ent_id):
-            self.id = ent_id
 
     @property
     def orth_(self):
@@ -877,7 +867,6 @@ cdef class Span:
         return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
 
     property label_:
-        """The span's label."""
         """The span's label."""
         def __get__(self):
             return self.doc.vocab.strings[self.label]
@@ -887,7 +876,6 @@ cdef class Span:
         self.label = self.doc.vocab.strings.add(label_)
 
     property kb_id_:
-        """The span's KB ID."""
         """The span's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
@@ -897,7 +885,6 @@ cdef class Span:
         self.kb_id = self.doc.vocab.strings.add(kb_id_)
 
     property id_:
-        """The span's ID."""
         """The span's ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.id]
@@ -914,6 +901,15 @@ cdef class Span:
         def __set__(self, str ent_id_):
             self.id_ = ent_id_
 
+    property ent_id_:
+        """Alias for the span's ID."""
+        def __get__(self):
+            return self.id_
+
+        def __set__(self, str ent_id_):
+            self.id_ = ent_id_
+
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 5d1b56daebb..7cf448f8f07 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -567,4 +567,5 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
+| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 16d421c12f4..12b99394350 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,6 +470,7 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
+| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 765b786996c..2c02b0d8ee2 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1403,17 +1403,15 @@ ruler.add_patterns(patterns)
 
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
-print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
 
 doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
-print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given in
-the patterns. So in the example above it's easy to identify that "San Francisco"
-and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given
+in the patterns. So in the example above it's easy to identify that "San
+Francisco" and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 

From acac2157e54efb704678bfb1b6c19034f6faed51 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Aug 2022 13:56:35 +0200
Subject: [PATCH 435/504] Make stable private modules public and adjust names
 (#11353)

* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
---
 spacy/ml/models/tok2vec.py        |   3 +-
 spacy/pipeline/attribute_ruler.py | 357 ------------------------------
 spacy/tokens/__init__.py          |   3 +-
 spacy/tokens/doc.pyi              |  13 +-
 spacy/tokens/doc.pyx              |  15 +-
 spacy/tokens/doc_bin.py           | 310 --------------------------
 6 files changed, 23 insertions(+), 678 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 8e59e34c053..a605d32cd40 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,6 +22,8 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
+from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
@@ -241,7 +243,6 @@ def CharacterEmbed(
     if feature is None:
         raise ValueError(Errors.E911.format(feat=feature))
     char_embed = chain(
-        character_embed.CharacterEmbed(nM=nM, nC=nC),
         character_embed.CharacterEmbed(nM=nM, nC=nC),
         cast(Model[List[Floats2d], Ragged], list2ragged()),
     )
diff --git a/spacy/pipeline/attribute_ruler.py b/spacy/pipeline/attribute_ruler.py
index 76f82b84e38..e69de29bb2d 100644
--- a/spacy/pipeline/attribute_ruler.py
+++ b/spacy/pipeline/attribute_ruler.py
@@ -1,357 +0,0 @@
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-
-import srsly
-
-from .. import util
-from ..errors import Errors
-from ..language import Language
-from ..matcher import Matcher
-from ..scorer import Scorer
-from ..symbols import IDS
-from ..tokens import Doc, Span
-from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs
-from ..training import Example
-from ..util import SimpleFrozenList, registry
-from ..vocab import Vocab
-from .pipe import Pipe
-
-MatcherPatternType = List[Dict[Union[int, str], Any]]
-AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
-TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
-MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
-
-
-@Language.factory(
-    "attribute_ruler",
-    default_config={
-        "validate": False,
-        "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
-    },
-)
-def make_attribute_ruler(
-    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
-):
-    return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
-
-
-def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-    def morph_key_getter(token, attr):
-        return getattr(token, attr).key
-
-    results = {}
-    results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
-    results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-    results.update(
-        Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
-    )
-    results.update(
-        Scorer.score_token_attr_per_feat(
-            examples, "morph", getter=morph_key_getter, **kwargs
-        )
-    )
-    results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
-    return results
-
-
-@registry.scorers("spacy.attribute_ruler_scorer.v1")
-def make_attribute_ruler_scorer():
-    return attribute_ruler_score
-
-
-class AttributeRuler(Pipe):
-    """Set token-level attributes for tokens matched by Matcher patterns.
-    Additionally supports importing patterns from tag maps and morph rules.
-
-    DOCS: https://spacy.io/api/attributeruler
-    """
-
-    def __init__(
-        self,
-        vocab: Vocab,
-        name: str = "attribute_ruler",
-        *,
-        validate: bool = False,
-        scorer: Optional[Callable] = attribute_ruler_score,
-    ) -> None:
-        """Create the AttributeRuler. After creation, you can add patterns
-        with the `.initialize()` or `.add_patterns()` methods, or load patterns
-        with `.from_bytes()` or `.from_disk()`. Loading patterns will remove
-        any patterns you've added previously.
-
-        vocab (Vocab): The vocab.
-        name (str): The pipe name. Defaults to "attribute_ruler".
-        scorer (Optional[Callable]): The scoring method. Defaults to
-            Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
-            "lemma" and Scorer.score_token_attr_per_feat for the attribute
-            "morph".
-
-        RETURNS (AttributeRuler): The AttributeRuler component.
-
-        DOCS: https://spacy.io/api/attributeruler#init
-        """
-        self.name = name
-        self.vocab = vocab
-        self.matcher = Matcher(self.vocab, validate=validate)
-        self.validate = validate
-        self.attrs: List[Dict] = []
-        self._attrs_unnormed: List[Dict] = []  # store for reference
-        self.indices: List[int] = []
-        self.scorer = scorer
-
-    def clear(self) -> None:
-        """Reset all patterns."""
-        self.matcher = Matcher(self.vocab, validate=self.validate)
-        self.attrs = []
-        self._attrs_unnormed = []
-        self.indices = []
-
-    def initialize(
-        self,
-        get_examples: Optional[Callable[[], Iterable[Example]]],
-        *,
-        nlp: Optional[Language] = None,
-        patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
-        tag_map: Optional[TagMapType] = None,
-        morph_rules: Optional[MorphRulesType] = None,
-    ) -> None:
-        """Initialize the attribute ruler by adding zero or more patterns.
-
-        Rules can be specified as a sequence of dicts using the `patterns`
-        keyword argument. You can also provide rules using the "tag map" or
-        "morph rules" formats supported by spaCy prior to v3.
-        """
-        self.clear()
-        if patterns:
-            self.add_patterns(patterns)
-        if tag_map:
-            self.load_from_tag_map(tag_map)
-        if morph_rules:
-            self.load_from_morph_rules(morph_rules)
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Apply the AttributeRuler to a Doc and set all attribute exceptions.
-
-        doc (Doc): The document to process.
-        RETURNS (Doc): The processed Doc.
-
-        DOCS: https://spacy.io/api/attributeruler#call
-        """
-        error_handler = self.get_error_handler()
-        try:
-            matches = self.match(doc)
-            self.set_annotations(doc, matches)
-            return doc
-        except Exception as e:
-            return error_handler(self.name, self, [doc], e)
-
-    def match(self, doc: Doc):
-        matches = self.matcher(doc, allow_missing=True, as_spans=False)
-        # Sort by the attribute ID, so that later rules have precedence
-        matches = [
-            (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches  # type: ignore
-        ]
-        matches.sort()
-        return matches
-
-    def set_annotations(self, doc, matches):
-        """Modify the document in place"""
-        for attr_id, match_id, start, end in matches:
-            span = Span(doc, start, end, label=match_id)
-            attrs = self.attrs[attr_id]
-            index = self.indices[attr_id]
-            try:
-                # The index can be negative, which makes it annoying to do
-                # the boundscheck. Let Span do it instead.
-                token = span[index]  # noqa: F841
-            except IndexError:
-                # The original exception is just our conditional logic, so we
-                # raise from.
-                raise ValueError(
-                    Errors.E1001.format(
-                        patterns=self.matcher.get(span.label),
-                        span=[t.text for t in span],
-                        index=index,
-                    )
-                ) from None
-            set_token_attrs(span[index], attrs)
-
-    def load_from_tag_map(
-        self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
-    ) -> None:
-        """Load attribute ruler patterns from a tag map.
-
-        tag_map (dict): The tag map that maps fine-grained tags to
-            coarse-grained tags and morphological features.
-
-        DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
-        """
-        for tag, attrs in tag_map.items():
-            pattern = [{"TAG": tag}]
-            attrs, morph_attrs = _split_morph_attrs(attrs)
-            if "MORPH" not in attrs:
-                morph = self.vocab.morphology.add(morph_attrs)
-                attrs["MORPH"] = self.vocab.strings[morph]
-            else:
-                morph = self.vocab.morphology.add(attrs["MORPH"])
-                attrs["MORPH"] = self.vocab.strings[morph]
-            self.add([pattern], attrs)  # type: ignore[list-item]
-
-    def load_from_morph_rules(
-        self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
-    ) -> None:
-        """Load attribute ruler patterns from morph rules.
-
-        morph_rules (dict): The morph rules that map token text and
-            fine-grained tags to coarse-grained tags, lemmas and morphological
-            features.
-
-        DOCS: https://spacy.io/api/attributeruler#load_from_morph_rules
-        """
-        for tag in morph_rules:
-            for word in morph_rules[tag]:
-                pattern = [{"ORTH": word, "TAG": tag}]
-                attrs = morph_rules[tag][word]
-                attrs, morph_attrs = _split_morph_attrs(attrs)
-                if "MORPH" in attrs:
-                    morph = self.vocab.morphology.add(attrs["MORPH"])
-                    attrs["MORPH"] = self.vocab.strings[morph]
-                elif morph_attrs:
-                    morph = self.vocab.morphology.add(morph_attrs)
-                    attrs["MORPH"] = self.vocab.strings[morph]
-                self.add([pattern], attrs)  # type: ignore[list-item]
-
-    def add(
-        self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0
-    ) -> None:
-        """Add Matcher patterns for tokens that should be modified with the
-        provided attributes. The token at the specified index within the
-        matched span will be assigned the attributes.
-
-        patterns (Iterable[List[Dict]]): A list of Matcher patterns.
-        attrs (Dict): The attributes to assign to the target token in the
-            matched span.
-        index (int): The index of the token in the matched span to modify. May
-            be negative to index from the end of the span. Defaults to 0.
-
-        DOCS: https://spacy.io/api/attributeruler#add
-        """
-        # We need to make a string here, because otherwise the ID we pass back
-        # will be interpreted as the hash of a string, rather than an ordinal.
-        key = str(len(self.attrs))
-        self.matcher.add(self.vocab.strings.add(key), patterns)  # type: ignore[arg-type]
-        self._attrs_unnormed.append(attrs)
-        attrs = normalize_token_attrs(self.vocab, attrs)
-        self.attrs.append(attrs)
-        self.indices.append(index)
-
-    def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
-        """Add patterns from a list of pattern dicts with the keys as the
-        arguments to AttributeRuler.add.
-        patterns (Iterable[dict]): A list of pattern dicts with the keys
-            as the arguments to AttributeRuler.add (patterns/attrs/index) to
-            add as patterns.
-
-        DOCS: https://spacy.io/api/attributeruler#add_patterns
-        """
-        for p in patterns:
-            self.add(**p)  # type: ignore[arg-type]
-
-    @property
-    def patterns(self) -> List[AttributeRulerPatternType]:
-        """All the added patterns."""
-        all_patterns = []
-        for i in range(len(self.attrs)):
-            p = {}
-            p["patterns"] = self.matcher.get(str(i))[1]
-            p["attrs"] = self._attrs_unnormed[i]  # type: ignore
-            p["index"] = self.indices[i]  # type: ignore
-            all_patterns.append(p)
-        return all_patterns  # type: ignore[return-value]
-
-    def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
-        """Serialize the AttributeRuler to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://spacy.io/api/attributeruler#to_bytes
-        """
-        serialize = {}
-        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
-        serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(
-        self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "AttributeRuler":
-        """Load the AttributeRuler from a bytestring.
-
-        bytes_data (bytes): The data to load.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        returns (AttributeRuler): The loaded object.
-
-        DOCS: https://spacy.io/api/attributeruler#from_bytes
-        """
-
-        def load_patterns(b):
-            self.add_patterns(srsly.msgpack_loads(b))
-
-        deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
-            "patterns": load_patterns,
-        }
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(
-        self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Serialize the AttributeRuler to disk.
-
-        path (Union[Path, str]): A path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://spacy.io/api/attributeruler#to_disk
-        """
-        serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
-            "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
-        }
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(
-        self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "AttributeRuler":
-        """Load the AttributeRuler from disk.
-
-        path (Union[Path, str]): A path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (AttributeRuler): The loaded object.
-
-        DOCS: https://spacy.io/api/attributeruler#from_disk
-        """
-
-        def load_patterns(p):
-            self.add_patterns(srsly.read_msgpack(p))
-
-        deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
-            "patterns": load_patterns,
-        }
-        util.from_disk(path, deserialize, exclude)
-        return self
-
-
-def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]:
-    """Split entries from a tag map or morph rules dict into to two dicts, one
-    with the token-level features (POS, LEMMA) and one with the remaining
-    features, which are presumed to be individual MORPH features."""
-    other_attrs = {}
-    morph_attrs = {}
-    for k, v in attrs.items():
-        if k in "_" or k in IDS.keys() or k in IDS.values():
-            other_attrs[k] = v
-        else:
-            morph_attrs[k] = v
-    return other_attrs, morph_attrs
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 7617e462fde..16c43485340 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,6 +4,7 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .token import Token
+from .doc_bin import DocBin
+from .morphanalysis import MorphAnalysis
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index bdbd653e518..1304a8aae8d 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,7 +8,6 @@ from typing import (
     List,
     Optional,
     Protocol,
-    Sequence,
     Tuple,
     Union,
     overload,
@@ -16,21 +15,19 @@ from typing import (
 
 import numpy as np
 from cymem.cymem import Pool
-from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
+from thinc.types import Floats1d, Floats2d, Ints2d
 from .span import Span
 from .token import Token
 from .span_groups import SpanGroups
 from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
 from ..vocab import Vocab
-from ._dict_proxies import SpanGroups
-from ._retokenize import Retokenizer
+from .retokenizer import Retokenizer
 from .span import Span
+from .span_groups import SpanGroups
 from .token import Token
 from .underscore import Underscore
 
-DOCBIN_ALL_ATTRS: Tuple[str, ...]
-
 class DocMethod(Protocol):
     def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]
 
@@ -40,7 +37,6 @@ class Doc:
     spans: SpanGroups
     max_length: int
     length: int
-    sentiment: float
     activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
     cats: Dict[str, float]
     user_hooks: Dict[str, Callable[..., Any]]
@@ -125,6 +121,7 @@ class Doc:
         start_idx: int,
         end_idx: int,
         label: Union[int, str] = ...,
+        *,
         kb_id: Union[int, str] = ...,
         vector: Optional[Floats1d] = ...,
         alignment_mode: str = ...,
@@ -152,7 +149,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...
+        default: str = ...,
     ) -> None: ...
     @property
     def noun_chunks(self) -> Tuple[Span]: ...
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4b8a15a65fd..169199bc563 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,8 +20,15 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-
 from .span_groups import SpanGroups
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..typedefs cimport attr_t, flags_t
+from ..attrs cimport attr_id_t
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
+
+from ._dict_proxies import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -50,6 +57,12 @@ from ..attrs import IDS, intify_attr
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
+from .. import util
+from .. import parts_of_speech
+from .. import schemas
+from .underscore import Underscore, get_ext_args
+from .retokenizer import Retokenizer
+from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from .retokenizer import Retokenizer
diff --git a/spacy/tokens/doc_bin.py b/spacy/tokens/doc_bin.py
index 4dda40a05ee..e69de29bb2d 100644
--- a/spacy/tokens/doc_bin.py
+++ b/spacy/tokens/doc_bin.py
@@ -1,310 +0,0 @@
-import zlib
-from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Set, Union
-
-import numpy
-import srsly
-from numpy import ndarray
-from thinc.api import NumpyOps
-
-from ..attrs import IDS, ORTH, SPACY, intify_attr
-from ..compat import copy_reg
-from ..errors import Errors
-from ..util import SimpleFrozenList, ensure_path
-from ..vocab import Vocab
-from .doc import Doc
-from .span_groups import SpanGroups
-
-# fmt: off
-ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
-# fmt: on
-
-
-class DocBin:
-    """Pack Doc objects for binary serialization.
-
-    The DocBin class lets you efficiently serialize the information from a
-    collection of Doc objects. You can control which information is serialized
-    by passing a list of attribute IDs, and optionally also specify whether the
-    user data is serialized. The DocBin is faster and produces smaller data
-    sizes than pickle, and allows you to deserialize without executing arbitrary
-    Python code.
-
-    The serialization format is gzipped msgpack, where the msgpack object has
-    the following structure:
-
-    {
-        "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
-        "tokens": bytes, # Serialized numpy uint64 array with the token data
-        "spans": List[Dict[str, bytes]], # SpanGroups data for each doc
-        "spaces": bytes, # Serialized numpy boolean array with spaces data
-        "lengths": bytes, # Serialized numpy int32 array with the doc lengths
-        "strings": List[str] # List of unique strings in the token data
-        "version": str, # DocBin version number
-    }
-
-    Strings for the words, tags, labels etc are represented by 64-bit hashes in
-    the token data, and every string that occurs at least once is passed via the
-    strings object. This means the storage is more efficient if you pack more
-    documents together, because you have less duplication in the strings.
-
-    A notable downside to this format is that you can't easily extract just one
-    document from the DocBin.
-    """
-
-    def __init__(
-        self,
-        attrs: Iterable[str] = ALL_ATTRS,
-        store_user_data: bool = False,
-        docs: Iterable[Doc] = SimpleFrozenList(),
-    ) -> None:
-        """Create a DocBin object to hold serialized annotations.
-
-        attrs (Iterable[str]): List of attributes to serialize. 'orth' and
-            'spacy' are always serialized, so they're not required.
-        store_user_data (bool): Whether to write the `Doc.user_data` to bytes/file.
-        docs (Iterable[Doc]): Docs to add.
-
-        DOCS: https://spacy.io/api/docbin#init
-        """
-        int_attrs = [intify_attr(attr) for attr in attrs]
-        if None in int_attrs:
-            non_valid = [attr for attr in attrs if intify_attr(attr) is None]
-            raise KeyError(
-                Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())
-            ) from None
-        attrs = sorted(int_attrs)
-        self.version = "0.1"
-        self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
-        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
-        self.tokens: List[ndarray] = []
-        self.spaces: List[ndarray] = []
-        self.cats: List[Dict] = []
-        self.span_groups: List[bytes] = []
-        self.user_data: List[Optional[bytes]] = []
-        self.flags: List[Dict] = []
-        self.strings: Set[str] = set()
-        self.store_user_data = store_user_data
-        for doc in docs:
-            self.add(doc)
-
-    def __len__(self) -> int:
-        """RETURNS: The number of Doc objects added to the DocBin."""
-        return len(self.tokens)
-
-    def add(self, doc: Doc) -> None:
-        """Add a Doc's annotations to the DocBin for serialization.
-
-        doc (Doc): The Doc object to add.
-
-        DOCS: https://spacy.io/api/docbin#add
-        """
-        array = doc.to_array(self.attrs)
-        if len(array.shape) == 1:
-            array = array.reshape((array.shape[0], 1))
-        self.tokens.append(array)
-        spaces = doc.to_array(SPACY)
-        assert array.shape[0] == spaces.shape[0]  # this should never happen
-        spaces = spaces.reshape((spaces.shape[0], 1))
-        self.spaces.append(numpy.asarray(spaces, dtype=bool))
-        self.flags.append({"has_unknown_spaces": doc.has_unknown_spaces})
-        for token in doc:
-            self.strings.add(token.text)
-            self.strings.add(token.tag_)
-            self.strings.add(token.lemma_)
-            self.strings.add(token.norm_)
-            self.strings.add(str(token.morph))
-            self.strings.add(token.dep_)
-            self.strings.add(token.ent_type_)
-            self.strings.add(token.ent_kb_id_)
-            self.strings.add(token.ent_id_)
-        self.cats.append(doc.cats)
-        if self.store_user_data:
-            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
-        self.span_groups.append(doc.spans.to_bytes())
-        for key, group in doc.spans.items():
-            for span in group:
-                self.strings.add(span.label_)
-                if span.kb_id in span.doc.vocab.strings:
-                    self.strings.add(span.kb_id_)
-                if span.id in span.doc.vocab.strings:
-                    self.strings.add(span.id_)
-
-    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
-        """Recover Doc objects from the annotations, using the given vocab.
-        Note that the user data of each doc will be read (if available) and returned,
-        regardless of the setting of 'self.store_user_data'.
-
-        vocab (Vocab): The shared vocab.
-        YIELDS (Doc): The Doc objects.
-
-        DOCS: https://spacy.io/api/docbin#get_docs
-        """
-        for string in self.strings:
-            vocab[string]
-        orth_col = self.attrs.index(ORTH)
-        for i in range(len(self.tokens)):
-            flags = self.flags[i]
-            tokens = self.tokens[i]
-            spaces: Optional[ndarray] = self.spaces[i]
-            if flags.get("has_unknown_spaces"):
-                spaces = None
-            doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)  # type: ignore
-            doc = doc.from_array(self.attrs, tokens)  # type: ignore
-            doc.cats = self.cats[i]
-            # backwards-compatibility: may be b'' or serialized empty list
-            if self.span_groups[i] and self.span_groups[i] != SpanGroups._EMPTY_BYTES:
-                doc.spans.from_bytes(self.span_groups[i])
-            else:
-                doc.spans.clear()
-            if i < len(self.user_data) and self.user_data[i] is not None:
-                user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
-                doc.user_data.update(user_data)
-            yield doc
-
-    def merge(self, other: "DocBin") -> None:
-        """Extend the annotations of this DocBin with the annotations from
-        another. Will raise an error if the pre-defined attrs of the two
-        DocBins don't match, or if they differ in whether or not to store
-        user data.
-
-        other (DocBin): The DocBin to merge into the current bin.
-
-        DOCS: https://spacy.io/api/docbin#merge
-        """
-        if self.attrs != other.attrs:
-            raise ValueError(
-                Errors.E166.format(param="attrs", current=self.attrs, other=other.attrs)
-            )
-        if self.store_user_data != other.store_user_data:
-            raise ValueError(
-                Errors.E166.format(
-                    param="store_user_data",
-                    current=self.store_user_data,
-                    other=other.store_user_data,
-                )
-            )
-        self.tokens.extend(other.tokens)
-        self.spaces.extend(other.spaces)
-        self.strings.update(other.strings)
-        self.cats.extend(other.cats)
-        self.span_groups.extend(other.span_groups)
-        self.flags.extend(other.flags)
-        self.user_data.extend(other.user_data)
-
-    def to_bytes(self) -> bytes:
-        """Serialize the DocBin's annotations to a bytestring.
-
-        RETURNS (bytes): The serialized DocBin.
-
-        DOCS: https://spacy.io/api/docbin#to_bytes
-        """
-        for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
-        lengths = [len(tokens) for tokens in self.tokens]
-        tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
-        spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
-        msg = {
-            "version": self.version,
-            "attrs": self.attrs,
-            "tokens": tokens.tobytes("C"),
-            "spaces": spaces.tobytes("C"),
-            "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
-            "strings": list(sorted(self.strings)),
-            "cats": self.cats,
-            "flags": self.flags,
-            "span_groups": self.span_groups,
-        }
-        if self.store_user_data:
-            msg["user_data"] = self.user_data
-        return zlib.compress(srsly.msgpack_dumps(msg))
-
-    def from_bytes(self, bytes_data: bytes) -> "DocBin":
-        """Deserialize the DocBin's annotations from a bytestring.
-
-        bytes_data (bytes): The data to load from.
-        RETURNS (DocBin): The loaded DocBin.
-
-        DOCS: https://spacy.io/api/docbin#from_bytes
-        """
-        try:
-            msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
-        except zlib.error:
-            raise ValueError(Errors.E1014)
-        self.attrs = msg["attrs"]
-        self.strings = set(msg["strings"])
-        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
-        flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
-        flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
-        shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
-        flat_tokens = flat_tokens.reshape(shape)
-        flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
-        self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
-        self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
-        self.cats = msg["cats"]
-        self.span_groups = msg.get("span_groups", [b"" for _ in lengths])
-        self.flags = msg.get("flags", [{} for _ in lengths])
-        if "user_data" in msg:
-            self.user_data = list(msg["user_data"])
-        else:
-            self.user_data = [None] * len(self)
-        for tokens in self.tokens:
-            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
-        return self
-
-    def to_disk(self, path: Union[str, Path]) -> None:
-        """Save the DocBin to a file (typically called .spacy).
-
-        path (str / Path): The file path.
-
-        DOCS: https://spacy.io/api/docbin#to_disk
-        """
-        path = ensure_path(path)
-        with path.open("wb") as file_:
-            try:
-                file_.write(self.to_bytes())
-            except ValueError:
-                raise ValueError(Errors.E870)
-
-    def from_disk(self, path: Union[str, Path]) -> "DocBin":
-        """Load the DocBin from a file (typically called .spacy).
-
-        path (str / Path): The file path.
-        RETURNS (DocBin): The loaded DocBin.
-
-        DOCS: https://spacy.io/api/docbin#to_disk
-        """
-        path = ensure_path(path)
-        with path.open("rb") as file_:
-            self.from_bytes(file_.read())
-        return self
-
-
-def merge_bins(bins):
-    merged = None
-    for byte_string in bins:
-        if byte_string is not None:
-            doc_bin = DocBin(store_user_data=True).from_bytes(byte_string)
-            if merged is None:
-                merged = doc_bin
-            else:
-                merged.merge(doc_bin)
-    if merged is not None:
-        return merged.to_bytes()
-    else:
-        return b""
-
-
-def pickle_bin(doc_bin):
-    return (unpickle_bin, (doc_bin.to_bytes(),))
-
-
-def unpickle_bin(byte_string):
-    return DocBin().from_bytes(byte_string)
-
-
-copy_reg.pickle(DocBin, pickle_bin, unpickle_bin)
-# Compatibility, as we had named it this previously.
-Binder = DocBin
-
-__all__ = ["DocBin"]

From 60b36798b5985f89b75250597b14d894cd072057 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 30 Aug 2022 22:40:31 +0900
Subject: [PATCH 436/504] Update/remove old Matcher syntax (#11370)

* Clean up old Matcher call style related stuff

In v2 Matcher.add was called with (key, on_match, *patterns). In v3 this
was changed to (key, patterns, *, on_match=None), but there were various
points where the old call syntax was documented or handled specially.
This removes all those.

The Matcher itself didn't need any code changes, as it just gives a
generic type error. However the PhraseMatcher required some changes
because it would automatically "fix" the old call style.

Surprisingly, the tokenizer was still using the old call style in one
place.

After these changes tests failed in two places:

1. one test for the "new" call style, including the "old" call style. I
   removed this test.
2. deserializing the PhraseMatcher fails because the input docs are a
   set.

I am not sure why 2 is happening - I guess it's a quirk of the
serialization format? - so for now I just convert the set to a list when
deserializing. The check that the input Docs are a List in the
PhraseMatcher is a new check, but makes it parallel with the other
Matchers, which seemed like the right thing to do.

* Add notes related to input docs / deserialization type

* Remove Typing import

* Remove old note about call style change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Use separate method for setting internal doc representations

In addition to the title change, this changes the internal dict to be a
defaultdict, instead of a dict with frequent use of setdefault.

* Add _add_from_arrays for unpickling

* Cleanup around adding from arrays

This moves adding to internal structures into the private batch method,
and removes the single-add method.

This has one behavioral change for `add`, in that if something is wrong
with the list of input Docs (such as one of the items not being a Doc),
valid items before the invalid one will not be added. Also the callback
will not be updated if anything is invalid. This change should not be
significant.

This also adds a test to check failure when given a non-Doc.

* Update spacy/matcher/phrasematcher.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                 | 35 ++++++++-------------------------
 spacy/matcher/phrasematcher.pyx | 15 ++++----------
 2 files changed, 12 insertions(+), 38 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index eab1d90c0f9..75cf2545919 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -498,7 +498,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
-    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
     E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
             "callable or None, but got: {arg_type}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
@@ -751,7 +750,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "loaded nlp object, but got: {source}")
     E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
-    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
     E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
     E949 = ("Unable to align tokens for the predicted and reference docs. It "
@@ -982,33 +980,16 @@ class Errors(metaclass=ErrorsWithCodes):
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
-    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
-             "but got '{received_type}'")
-    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
-    E4003 = ("Training examples for distillation must have the exact same tokens in the "
-             "reference and predicted docs.")
-    E4004 = ("Backprop is not supported when is_train is not set.")
-    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
-    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
-    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
-             "{existing_value}.")
-    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
-    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
-    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
-             "[initialize] or in registered lookups (spacy-lookups-data). An "
-             "example for how to load lemmatizer tables in [initialize]:\n\n"
-             "[initialize.components]\n\n"
-             "[initialize.components.{pipe_name}]\n\n"
-             "[initialize.components.{pipe_name}.lookups]\n"
-             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
-             "lang = ${{nlp.lang}}\n"
-             f'url = "{about.__lookups_url__}"\n'
-             "tables = {tables}\n"
-             "# or required tables only: tables = {required_tables}\n")
-    E4011 = ("Server error ({status_code}), couldn't fetch {url}")
 
 
-RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
+# Deprecated model shortcuts, only used in errors and warnings
+OLD_MODEL_SHORTCUTS = {
+    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
+}
+
 
 # fmt: on
 
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index e8ad394b0d7..87da110de04 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,9 +1,8 @@
 # cython: infer_types=True, profile=True
-from collections import defaultdict
 from typing import List
-
+from collections import defaultdict
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
+from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 
 import warnings
 
@@ -45,7 +44,6 @@ cdef class PhraseMatcher:
         self.vocab = vocab
         self._callbacks = {}
         self._docs = defaultdict(set)
-        self._docs = defaultdict(set)
         self._validate = validate
 
         self.mem = Pool()
@@ -161,28 +159,23 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
+
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
         key (str): The match ID.
         specs (List[List[int]]): A list of lists of hashes to match.
-        specs (List[List[int]]): A list of lists of hashes to match.
         on_match (callable): Callback executed on match.
         """
-        """
         cdef MapStruct* current_node
         cdef MapStruct* internal_node
         cdef void* result
 
-        self._callbacks[key] = on_match
-        for spec in specs:
-            self._docs[key].add(tuple(spec))
         self._callbacks[key] = on_match
         for spec in specs:
             self._docs[key].add(tuple(spec))
 
             current_node = self.c_map
-            for token in spec:
             for token in spec:
                 if token == self._terminal_hash:
                     warnings.warn(Warnings.W021)
@@ -202,6 +195,7 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
+
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
@@ -365,7 +359,6 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
         matcher._add_from_arrays(key, specs, on_match=callback)
-        matcher._add_from_arrays(key, specs, on_match=callback)
     return matcher
 
 

From df8cb3c1ae13b4c4897186c17157e8cce58f7a97 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:51:06 +0200
Subject: [PATCH 437/504] `StringStore` refactoring (#11344)

* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb6b08cfdcfbf26df6896f26709fac0d.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
---
 spacy/strings.pxd | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index c05731c9a15..688dbc46261 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,3 +1,6 @@
+from libc.stdint cimport int64_t, uint32_t
+from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set

From d07a93aa5b1db9ef535e7706d3c593e96083c5c7 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 21 Oct 2022 18:01:18 +0900
Subject: [PATCH 438/504] Remove thinc util reimports (#11665)

* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
---
 spacy/util.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index ae9837e3afe..fdc02a717cc 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -59,6 +59,9 @@
     cupy = None
 
 
+from .symbols import ORTH
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From 0d41e8fc3e2a8b77083a1e05d64e334d932e4fbb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 439/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/pipeline/__init__.py                 |   1 -
 spacy/pipeline/entity_ruler.py             | 541 ---------------------
 website/docs/api/entityruler.mdx           | 298 ++----------
 website/docs/usage/rule-based-matching.mdx |   8 +-
 website/docs/usage/saving-loading.mdx      |   6 +-
 5 files changed, 44 insertions(+), 810 deletions(-)
 delete mode 100644 spacy/pipeline/entity_ruler.py

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index d54e190cc81..e26f7436efa 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,7 +3,6 @@
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
-from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .ner import EntityRecognizer
diff --git a/spacy/pipeline/entity_ruler.py b/spacy/pipeline/entity_ruler.py
deleted file mode 100644
index 3683cfc0270..00000000000
--- a/spacy/pipeline/entity_ruler.py
+++ /dev/null
@@ -1,541 +0,0 @@
-import warnings
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
-
-import srsly
-
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..matcher import Matcher, PhraseMatcher
-from ..matcher.levenshtein import levenshtein_compare
-from ..scorer import get_ner_prf
-from ..tokens import Doc, Span
-from ..training import Example
-from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk
-from .pipe import Pipe
-
-DEFAULT_ENT_ID_SEP = "||"
-PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
-
-
-@Language.factory(
-    "entity_ruler",
-    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
-    default_config={
-        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
-        "validate": False,
-        "overwrite_ents": False,
-        "ent_id_sep": DEFAULT_ENT_ID_SEP,
-        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
-    },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
-)
-def make_entity_ruler(
-    nlp: Language,
-    name: str,
-    phrase_matcher_attr: Optional[Union[int, str]],
-    matcher_fuzzy_compare: Callable,
-    validate: bool,
-    overwrite_ents: bool,
-    ent_id_sep: str,
-    scorer: Optional[Callable],
-):
-    return EntityRuler(
-        nlp,
-        name,
-        phrase_matcher_attr=phrase_matcher_attr,
-        matcher_fuzzy_compare=matcher_fuzzy_compare,
-        validate=validate,
-        overwrite_ents=overwrite_ents,
-        ent_id_sep=ent_id_sep,
-        scorer=scorer,
-    )
-
-
-def entity_ruler_score(examples, **kwargs):
-    return get_ner_prf(examples)
-
-
-@registry.scorers("spacy.entity_ruler_scorer.v1")
-def make_entity_ruler_scorer():
-    return entity_ruler_score
-
-
-class EntityRuler(Pipe):
-    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
-    rules or exact phrase matches. It can be combined with the statistical
-    `EntityRecognizer` to boost accuracy, or used on its own to implement a
-    purely rule-based entity recognition system. After initialization, the
-    component is typically added to the pipeline using `nlp.add_pipe`.
-
-    DOCS: https://spacy.io/api/entityruler
-    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
-    """
-
-    def __init__(
-        self,
-        nlp: Language,
-        name: str = "entity_ruler",
-        *,
-        phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = levenshtein_compare,
-        validate: bool = False,
-        overwrite_ents: bool = False,
-        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
-        patterns: Optional[List[PatternType]] = None,
-        scorer: Optional[Callable] = entity_ruler_score,
-    ) -> None:
-        """Initialize the entity ruler. If patterns are supplied here, they
-        need to be a list of dictionaries with a `"label"` and `"pattern"`
-        key. A pattern can either be a token pattern (list) or a phrase pattern
-        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
-
-        nlp (Language): The shared nlp object to pass the vocab to the matchers
-            and process phrase patterns.
-        name (str): Instance name of the current pipeline component. Typically
-            passed in automatically from the factory when the component is
-            added. Used to disable the current entity ruler while creating
-            phrase patterns with the nlp object.
-        phrase_matcher_attr (int / str): Token attribute to match on, passed
-            to the internal PhraseMatcher as `attr`.
-        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
-            internal Matcher. Defaults to
-            spacy.matcher.levenshtein.levenshtein_compare.
-        validate (bool): Whether patterns should be validated, passed to
-            Matcher and PhraseMatcher as `validate`
-        patterns (iterable): Optional patterns to load in.
-        overwrite_ents (bool): If existing entities are present, e.g. entities
-            added by the model, overwrite them by matches if necessary.
-        ent_id_sep (str): Separator used internally for entity IDs.
-        scorer (Optional[Callable]): The scoring method. Defaults to
-            spacy.scorer.get_ner_prf.
-
-        DOCS: https://spacy.io/api/entityruler#init
-        """
-        self.nlp = nlp
-        self.name = name
-        self.overwrite = overwrite_ents
-        self.token_patterns = defaultdict(list)  # type: ignore
-        self.phrase_patterns = defaultdict(list)  # type: ignore
-        self._validate = validate
-        self.matcher_fuzzy_compare = matcher_fuzzy_compare
-        self.matcher = Matcher(
-            nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
-        )
-        self.phrase_matcher_attr = phrase_matcher_attr
-        self.phrase_matcher = PhraseMatcher(
-            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-        )
-        self.ent_id_sep = ent_id_sep
-        self._ent_ids = defaultdict(tuple)  # type: ignore
-        if patterns is not None:
-            self.add_patterns(patterns)
-        self.scorer = scorer
-
-    def __len__(self) -> int:
-        """The number of all patterns added to the entity ruler."""
-        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
-        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
-        return n_token_patterns + n_phrase_patterns
-
-    def __contains__(self, label: str) -> bool:
-        """Whether a label is present in the patterns."""
-        return label in self.token_patterns or label in self.phrase_patterns
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Find matches in document and add them as entities.
-
-        doc (Doc): The Doc object in the pipeline.
-        RETURNS (Doc): The Doc with added entities, if available.
-
-        DOCS: https://spacy.io/api/entityruler#call
-        """
-        error_handler = self.get_error_handler()
-        try:
-            matches = self.match(doc)
-            self.set_annotations(doc, matches)
-            return doc
-        except Exception as e:
-            return error_handler(self.name, self, [doc], e)
-
-    def match(self, doc: Doc):
-        self._require_patterns()
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", message="\\[W036")
-            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
-
-        final_matches = set(
-            [(m_id, start, end) for m_id, start, end in matches if start != end]
-        )
-        get_sort_key = lambda m: (m[2] - m[1], -m[1])
-        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
-        return final_matches
-
-    def set_annotations(self, doc, matches):
-        """Modify the document in place"""
-        entities = list(doc.ents)
-        new_entities = []
-        seen_tokens = set()
-        for match_id, start, end in matches:
-            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
-                continue
-            # check for end - 1 here because boundaries are inclusive
-            if start not in seen_tokens and end - 1 not in seen_tokens:
-                if match_id in self._ent_ids:
-                    label, ent_id = self._ent_ids[match_id]
-                    span = Span(doc, start, end, label=label, span_id=ent_id)
-                else:
-                    span = Span(doc, start, end, label=match_id)
-                new_entities.append(span)
-                entities = [
-                    e for e in entities if not (e.start < end and e.end > start)
-                ]
-                seen_tokens.update(range(start, end))
-        doc.ents = entities + new_entities
-
-    @property
-    def labels(self) -> Tuple[str, ...]:
-        """All labels present in the match patterns.
-
-        RETURNS (set): The string labels.
-
-        DOCS: https://spacy.io/api/entityruler#labels
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_labels = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                label, _ = self._split_label(l)
-                all_labels.add(label)
-            else:
-                all_labels.add(l)
-        return tuple(sorted(all_labels))
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language] = None,
-        patterns: Optional[Sequence[PatternType]] = None,
-    ):
-        """Initialize the pipe for training.
-
-        get_examples (Callable[[], Iterable[Example]]): Function that
-            returns a representative sample of gold-standard Example objects.
-        nlp (Language): The current nlp object the component is part of.
-        patterns Optional[Iterable[PatternType]]: The list of patterns.
-
-        DOCS: https://spacy.io/api/entityruler#initialize
-        """
-        self.clear()
-        if patterns:
-            self.add_patterns(patterns)  # type: ignore[arg-type]
-
-    @property
-    def ent_ids(self) -> Tuple[Optional[str], ...]:
-        """All entity ids present in the match patterns `id` properties
-
-        RETURNS (set): The string entity ids.
-
-        DOCS: https://spacy.io/api/entityruler#ent_ids
-        """
-        keys = set(self.token_patterns.keys())
-        keys.update(self.phrase_patterns.keys())
-        all_ent_ids = set()
-
-        for l in keys:
-            if self.ent_id_sep in l:
-                _, ent_id = self._split_label(l)
-                all_ent_ids.add(ent_id)
-        return tuple(all_ent_ids)
-
-    @property
-    def patterns(self) -> List[PatternType]:
-        """Get all patterns that were added to the entity ruler.
-
-        RETURNS (list): The original patterns, one dictionary per pattern.
-
-        DOCS: https://spacy.io/api/entityruler#patterns
-        """
-        all_patterns = []
-        for label, patterns in self.token_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        for label, patterns in self.phrase_patterns.items():
-            for pattern in patterns:
-                ent_label, ent_id = self._split_label(label)
-                p = {"label": ent_label, "pattern": pattern.text}
-                if ent_id:
-                    p["id"] = ent_id
-                all_patterns.append(p)
-        return all_patterns
-
-    def add_patterns(self, patterns: List[PatternType]) -> None:
-        """Add patterns to the entity ruler. A pattern can either be a token
-        pattern (list of dicts) or a phrase pattern (string). For example:
-        {'label': 'ORG', 'pattern': 'Apple'}
-        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
-
-        patterns (list): The patterns to add.
-
-        DOCS: https://spacy.io/api/entityruler#add_patterns
-        """
-
-        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
-        try:
-            current_index = -1
-            for i, (name, pipe) in enumerate(self.nlp.pipeline):
-                if self == pipe:
-                    current_index = i
-                    break
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
-        except ValueError:
-            subsequent_pipes = []
-        with self.nlp.select_pipes(disable=subsequent_pipes):
-            token_patterns = []
-            phrase_pattern_labels = []
-            phrase_pattern_texts = []
-            phrase_pattern_ids = []
-            for entry in patterns:
-                if isinstance(entry["pattern"], str):
-                    phrase_pattern_labels.append(entry["label"])
-                    phrase_pattern_texts.append(entry["pattern"])
-                    phrase_pattern_ids.append(entry.get("id"))
-                elif isinstance(entry["pattern"], list):
-                    token_patterns.append(entry)
-            phrase_patterns = []
-            for label, pattern, ent_id in zip(
-                phrase_pattern_labels,
-                self.nlp.pipe(phrase_pattern_texts),
-                phrase_pattern_ids,
-            ):
-                phrase_pattern = {"label": label, "pattern": pattern}
-                if ent_id:
-                    phrase_pattern["id"] = ent_id
-                phrase_patterns.append(phrase_pattern)
-            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
-                label = entry["label"]  # type: ignore
-                if "id" in entry:
-                    ent_label = label
-                    label = self._create_label(label, entry["id"])
-                    key = self.matcher._normalize_key(label)
-                    self._ent_ids[key] = (ent_label, entry["id"])
-                pattern = entry["pattern"]  # type: ignore
-                if isinstance(pattern, Doc):
-                    self.phrase_patterns[label].append(pattern)
-                    self.phrase_matcher.add(label, [pattern])  # type: ignore
-                elif isinstance(pattern, list):
-                    self.token_patterns[label].append(pattern)
-                    self.matcher.add(label, [pattern])
-                else:
-                    raise ValueError(Errors.E097.format(pattern=pattern))
-
-    def clear(self) -> None:
-        """Reset all patterns."""
-        self.token_patterns = defaultdict(list)
-        self.phrase_patterns = defaultdict(list)
-        self._ent_ids = defaultdict(tuple)
-        self.matcher = Matcher(
-            self.nlp.vocab,
-            validate=self._validate,
-            fuzzy_compare=self.matcher_fuzzy_compare,
-        )
-        self.phrase_matcher = PhraseMatcher(
-            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
-        )
-
-    def remove(self, ent_id: str) -> None:
-        """Remove a pattern by its ent_id if a pattern with this ent_id was added before
-
-        ent_id (str): id of the pattern to be removed
-        RETURNS: None
-        DOCS: https://spacy.io/api/entityruler#remove
-        """
-        label_id_pairs = [
-            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
-        ]
-        if not label_id_pairs:
-            raise ValueError(
-                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
-            )
-        created_labels = [
-            self._create_label(label, eid) for (label, eid) in label_id_pairs
-        ]
-        # remove the patterns from self.phrase_patterns
-        self.phrase_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.phrase_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        self.token_patterns = defaultdict(
-            list,
-            {
-                label: val
-                for (label, val) in self.token_patterns.items()
-                if label not in created_labels
-            },
-        )
-        # remove the patterns from self.token_pattern
-        for label in created_labels:
-            if label in self.phrase_matcher:
-                self.phrase_matcher.remove(label)
-            else:
-                self.matcher.remove(label)
-
-    def _require_patterns(self) -> None:
-        """Raise a warning if this component has no patterns defined."""
-        if len(self) == 0:
-            warnings.warn(Warnings.W036.format(name=self.name))
-
-    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
-        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
-
-        label (str): The value of label in a pattern entry
-        RETURNS (tuple): ent_label, ent_id
-        """
-        if self.ent_id_sep in label:
-            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
-        else:
-            ent_label = label
-            ent_id = None  # type: ignore
-        return ent_label, ent_id
-
-    def _create_label(self, label: Any, ent_id: Any) -> str:
-        """Join Entity label with ent_id if the pattern has an `id` attribute
-        If ent_id is not a string, the label is returned as is.
-
-        label (str): The label to set for ent.label_
-        ent_id (str): The label
-        RETURNS (str): The ent_label joined with configured `ent_id_sep`
-        """
-        if isinstance(ent_id, str):
-            label = f"{label}{self.ent_id_sep}{ent_id}"
-        return label
-
-    def from_bytes(
-        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a bytestring.
-
-        patterns_bytes (bytes): The bytestring to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_bytes
-        """
-        cfg = srsly.msgpack_loads(patterns_bytes)
-        self.clear()
-        if isinstance(cfg, dict):
-            self.add_patterns(cfg.get("patterns", cfg))
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab,
-                attr=self.phrase_matcher_attr,
-            )
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-        else:
-            self.add_patterns(cfg)
-        return self
-
-    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
-        """Serialize the entity ruler patterns to a bytestring.
-
-        RETURNS (bytes): The serialized patterns.
-
-        DOCS: https://spacy.io/api/entityruler#to_bytes
-        """
-        serial = {
-            "overwrite": self.overwrite,
-            "ent_id_sep": self.ent_id_sep,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "patterns": self.patterns,
-        }
-        return srsly.msgpack_dumps(serial)
-
-    def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> "EntityRuler":
-        """Load the entity ruler from a file. Expects a file containing
-        newline-delimited JSON (JSONL) with one entry per line.
-
-        path (str / Path): The JSONL file to load.
-        RETURNS (EntityRuler): The loaded entity ruler.
-
-        DOCS: https://spacy.io/api/entityruler#from_disk
-        """
-        path = ensure_path(path)
-        self.clear()
-        depr_patterns_path = path.with_suffix(".jsonl")
-        if path.suffix == ".jsonl":  # user provides a jsonl
-            if path.is_file:
-                patterns = srsly.read_jsonl(path)
-                self.add_patterns(patterns)
-            else:
-                raise ValueError(Errors.E1023.format(path=path))
-        elif depr_patterns_path.is_file():
-            patterns = srsly.read_jsonl(depr_patterns_path)
-            self.add_patterns(patterns)
-        elif path.is_dir():  # path is a valid directory
-            cfg = {}
-            deserializers_patterns = {
-                "patterns": lambda p: self.add_patterns(
-                    srsly.read_jsonl(p.with_suffix(".jsonl"))
-                )
-            }
-            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
-            from_disk(path, deserializers_cfg, {})
-            self.overwrite = cfg.get("overwrite", False)
-            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
-            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab, attr=self.phrase_matcher_attr
-            )
-            from_disk(path, deserializers_patterns, {})
-        else:  # path is not a valid directory or file
-            raise ValueError(Errors.E146.format(path=path))
-        return self
-
-    def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
-        """Save the entity ruler patterns to a directory. The patterns will be
-        saved as newline-delimited JSON (JSONL).
-
-        path (str / Path): The JSONL file to save.
-
-        DOCS: https://spacy.io/api/entityruler#to_disk
-        """
-        path = ensure_path(path)
-        cfg = {
-            "overwrite": self.overwrite,
-            "phrase_matcher_attr": self.phrase_matcher_attr,
-            "ent_id_sep": self.ent_id_sep,
-        }
-        serializers = {
-            "patterns": lambda p: srsly.write_jsonl(
-                p.with_suffix(".jsonl"), self.patterns
-            ),
-            "cfg": lambda p: srsly.write_json(p, cfg),
-        }
-        if path.suffix == ".jsonl":  # user wants to save only JSONL
-            srsly.write_jsonl(path, self.patterns)
-        else:
-            to_disk(path, serializers, {})
diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx
index 8a5dccd329b..bc9ec050323 100644
--- a/website/docs/api/entityruler.mdx
+++ b/website/docs/api/entityruler.mdx
@@ -1,7 +1,5 @@
 ---
 title: EntityRuler
-tag: class
-source: spacy/pipeline/entity_ruler.py
 new: 2.1
 teaser: 'Pipeline component for rule-based named entity recognition'
 api_string_name: entity_ruler
@@ -77,273 +75,51 @@ how the component should be configured. You can override its settings via the
 | `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                       |
 | `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                 |
 
-```python
-%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
-```
-
-## EntityRuler.\_\_init\_\_ {id="init",tag="method"}
+## Migrating from v3 {#migrating}
 
-Initialize the entity ruler. If patterns are supplied here, they need to be a
-list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either
-be a token pattern (list) or a phrase pattern (string). For example:
-`{"label": "ORG", "pattern": "Apple"}`.
+### Loading patterns
 
-> #### Example
->
-> ```python
-> # Construction via add_pipe
-> ruler = nlp.add_pipe("entity_ruler")
->
-> # Construction from class
-> from spacy.pipeline import EntityRuler
-> ruler = EntityRuler(nlp, overwrite_ents=True)
-> ```
+Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
+initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
+path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
+JSONL file separately and then added through
+[`SpanRuler.initialize`](/api/spanruler#initialize]) or
+[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
 
-| Name                                                 | Description                                                                                                                                                                                                                           |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `nlp`                                                | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~                                                                                                                                     |
-| `name` <Tag variant="new">3</Tag>                    | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
-| _keyword-only_                                       |                                                                                                                                                                                                                                       |
-| `phrase_matcher_attr`                                | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~                                         |
-| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~                                                                                                     |
-| `validate`                                           | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~                                                                                                                |
-| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~                                                                                             |
-| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~                                                                                                                                                               |
-| `patterns`                                           | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
-| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~                                                                                                                         |
-
-## EntityRuler.initialize {id="initialize",tag="method",version="3"}
-
-Initialize the component with data and used before training to load in rules
-from a [pattern file](/usage/rule-based-matching/#entityruler-files). This
-method is typically called by [`Language.initialize`](/api/language#initialize)
-and lets you customize arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-> #### Example
->
-> ```python
-> entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.entity_ruler]
->
-> [initialize.components.entity_ruler.patterns]
-> @readers = "srsly.read_jsonl.v1"
-> path = "corpus/entity_ruler_patterns.jsonl
-> ```
-
-| Name           | Description                                                                                                                                                          |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                                                      |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
-
-## EntityRuler.\_\_len\_\_ {id="len",tag="method"}
-
-The number of all patterns added to the entity ruler.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> assert len(ruler) == 0
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert len(ruler) == 1
-> ```
-
-| Name        | Description                     |
-| ----------- | ------------------------------- |
-| **RETURNS** | The number of patterns. ~~int~~ |
-
-## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"}
-
-Whether a label is present in the patterns.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
-> assert "ORG" in ruler
-> assert not "PERSON" in ruler
-> ```
-
-| Name        | Description                                           |
-| ----------- | ----------------------------------------------------- |
-| `label`     | The label to check. ~~str~~                           |
-| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
-
-## EntityRuler.\_\_call\_\_ {id="call",tag="method"}
-
-Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
-happens automatically after the component has been added to the pipeline using
-[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
-with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes
-longer patterns over shorter, and if equal the match occuring first in the Doc
-is chosen.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
->
-> doc = nlp("A text about Apple.")
-> ents = [(ent.text, ent.label_) for ent in doc.ents]
-> assert ents == [("Apple", "ORG")]
-> ```
-
-| Name        | Description                                                          |
-| ----------- | -------------------------------------------------------------------- |
-| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
-| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~        |
-
-## EntityRuler.add_patterns {id="add_patterns",tag="method"}
-
-Add patterns to the entity ruler. A pattern can either be a token pattern (list
-of dicts) or a phrase pattern (string). For more details, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
-
-> #### Example
->
-> ```python
-> patterns = [
->     {"label": "ORG", "pattern": "Apple"},
->     {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
-> ]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ```
-
-| Name       | Description                                                      |
-| ---------- | ---------------------------------------------------------------- |
-| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
-
-## EntityRuler.remove {id="remove",tag="method",version="3.2.1"}
-
-Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if
-the ID does not exist.
-
-> #### Example
->
-> ```python
-> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.add_patterns(patterns)
-> ruler.remove("apple")
-> ```
-
-| Name | Description                         |
-| ---- | ----------------------------------- |
-| `id` | The ID of the pattern rule. ~~str~~ |
-
-## EntityRuler.to_disk {id="to_disk",tag="method"}
-
-Save the entity ruler patterns to a directory. The patterns will be saved as
-newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
-only the patterns are saved as JSONL. If a directory name is provided, a
-`patterns.jsonl` and `cfg` file with the component configuration is exported.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.to_disk("/path/to/patterns.jsonl")  # saves patterns only
-> ruler.to_disk("/path/to/entity_ruler")    # saves patterns and config
-> ```
-
-| Name   | Description                                                                                                                                              |
-| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-
-## EntityRuler.from_disk {id="from_disk",tag="method"}
-
-Load the entity ruler from a path. Expects either a file containing
-newline-delimited JSON (JSONL) with one entry per line, or a directory
-containing a `patterns.jsonl` file and a `cfg` file with the component
-configuration.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_disk("/path/to/patterns.jsonl")  # loads patterns only
-> ruler.from_disk("/path/to/entity_ruler")    # loads patterns and config
-> ```
-
-| Name        | Description                                                                                                   |
-| ----------- | ------------------------------------------------------------------------------------------------------------- |
-| `path`      | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~                                                            |
-
-## EntityRuler.to_bytes {id="to_bytes",tag="method"}
-
-Serialize the entity ruler patterns to a bytestring.
-
-> #### Example
->
-> ```python
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler_bytes = ruler.to_bytes()
-> ```
-
-| Name        | Description                        |
-| ----------- | ---------------------------------- |
-| **RETURNS** | The serialized patterns. ~~bytes~~ |
-
-## EntityRuler.from_bytes {id="from_bytes",tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> ruler_bytes = ruler.to_bytes()
-> ruler = nlp.add_pipe("entity_ruler")
-> ruler.from_bytes(ruler_bytes)
-> ```
-
-| Name         | Description                                        |
-| ------------ | -------------------------------------------------- |
-| `bytes_data` | The bytestring to load. ~~bytes~~                  |
-| **RETURNS**  | The modified `EntityRuler` object. ~~EntityRuler~~ |
-
-## EntityRuler.labels {id="labels",tag="property"}
-
-All labels present in the match patterns.
-
-| Name        | Description                            |
-| ----------- | -------------------------------------- |
-| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.from_disk("patterns.jsonl")
++ import srsly
++ patterns = srsly.read_jsonl("patterns.jsonl")
++ ruler.add_patterns(patterns)
+```
 
-## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"}
+### Saving patterns
 
-All entity IDs present in the `id` properties of the match patterns.
+`SpanRuler.to_disk` always saves the full component data to a directory and does
+not include an option to save the patterns to a single JSONL file.
 
-| Name        | Description                         |
-| ----------- | ----------------------------------- |
-| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.to_disk("patterns.jsonl")
++ import srsly
++ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
+```
 
-## EntityRuler.patterns {id="patterns",tag="property"}
+### Accessing token and phrase patterns
 
-Get all patterns that were added to the entity ruler.
+The separate token patterns and phrase patterns are no longer accessible under
+`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
+patterns in their original format using the property
+[`SpanRuler.patterns`](/api/spanruler#patterns).
 
-| Name        | Description                                                                              |
-| ----------- | ---------------------------------------------------------------------------------------- |
-| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
+### Removing patterns by ID
 
-## Attributes {id="attributes"}
+[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
+remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
 
-| Name              | Description                                                                                                           |
-| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
-| `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
-| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
+```diff
+ ruler = nlp.get_pipe("entity_ruler")
+- ruler.remove("id")
++ ruler.remove_by_id("id")
+```
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 2c02b0d8ee2..8469d587ed1 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1408,10 +1408,10 @@ doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 
-If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `id_` property of the matched entity is set to the `id` given
-in the patterns. So in the example above it's easy to identify that "San
-Francisco" and "San Fran" are both the same entity.
+If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
+patterns, the `id_` property of the matched entity is set to the `id` given in
+the patterns. So in the example above it's easy to identify that "San Francisco"
+and "San Fran" are both the same entity.
 
 ### Using pattern files {id="entityruler-files"}
 
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 420dc1d281e..3712fbeeb80 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -187,9 +187,9 @@ the data to and from a JSON file.
 
 > #### Real-world example
 >
-> To see custom serialization methods in action, check out the new
-> [`EntityRuler`](/api/entityruler) component and its
-> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the
+> To see custom serialization methods in action, check out the
+> [`SpanRuler`](/api/spanruler) component and its
+> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
 > component will be saved to a `.jsonl` file if the pipeline is serialized to
 > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
 > saving out a pipeline with rule-based components _with_ all the component

From 53d6eedb870f9bf7d3e922f52911d4345e31f6c2 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Wed, 23 Nov 2022 13:09:32 +0100
Subject: [PATCH 440/504] Remove sentiment extension (#11722)

* remove sentiment attribute

* remove sentiment from docs

* add test for backwards compatibility

* replace from_disk with from_bytes

* Fix docs and format file

* Fix formatting
---
 website/docs/api/span.mdx  | 1 -
 website/docs/api/token.mdx | 1 -
 2 files changed, 2 deletions(-)

diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 7cf448f8f07..5d1b56daebb 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -567,5 +567,4 @@ overlaps with will be returned.
 | `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
-| `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
 | `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx
index 12b99394350..16d421c12f4 100644
--- a/website/docs/api/token.mdx
+++ b/website/docs/api/token.mdx
@@ -470,7 +470,6 @@ The L2 norm of the token's vector representation.
 | `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                |
 | `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                      |
 | `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                |
-| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                       |
 | `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                 |
 | `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                            |

From bbd35d9275fde888920170900a011ac746fa11ef Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:45:52 +0900
Subject: [PATCH 441/504] Remove old model shortcuts (#11916)

* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/download.py | 12 ++----------
 spacy/errors.py       |  9 ---------
 spacy/util.py         |  2 +-
 3 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 5e460717cc4..0b8ed54ed3c 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,16 +7,8 @@
 from wasabi import msg
 
 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
-    get_minor_version,
-    is_in_interactive,
-    is_in_jupyter,
-    is_package,
-    is_prerelease_version,
-    run_command,
-)
-from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
+from ..util import is_package, get_minor_version, run_command
+from ..util import is_prerelease_version
 
 
 @app.command(
diff --git a/spacy/errors.py b/spacy/errors.py
index 75cf2545919..22532756e0c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -982,15 +982,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E4000 = ("Expected a Doc as input, but got: '{type}'")
 
 
-# Deprecated model shortcuts, only used in errors and warnings
-OLD_MODEL_SHORTCUTS = {
-    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
-    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
-    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
-    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
-}
-
-
 # fmt: on
 
 
diff --git a/spacy/util.py b/spacy/util.py
index fdc02a717cc..4f4718af5ff 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -61,7 +61,7 @@
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
+from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From 72889e520031859b3c3b4282060bbd6c80dd017a Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 12 Dec 2022 08:55:53 +0100
Subject: [PATCH 442/504] Custom extensions for spans with equal boundaries
 (#11429)

* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/doc/test_underscore.py |  4 ++++
 spacy/tokens/span.pyx              | 23 +++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index c9ed4a8bdaa..d09126a7b2c 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+
 
 # Helper functions
 def _get_tuple(s: Span):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e9bc3c7311c..79a907abe6d 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -234,7 +234,7 @@ cdef class Span:
         """Custom extension attributes registered via `set_extension`."""
         cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
-                          start=span_c.start_char, end=span_c.end_char)
+                          start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
 
     def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -829,21 +829,36 @@ cdef class Span:
             return self.span_c().label
 
         def __set__(self, attr_t label):
-            self.span_c().label = label
+            if label != self.span_c().label :
+                old_label = self.span_c().label
+                self.span_c().label = label
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property kb_id:
         def __get__(self):
             return self.span_c().kb_id
 
         def __set__(self, attr_t kb_id):
-            self.span_c().kb_id = kb_id
+            if kb_id != self.span_c().kb_id :
+                old_kb_id = self.span_c().kb_id
+                self.span_c().kb_id = kb_id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id)
+                Underscore._replace_keys(old, new)
 
     property id:
         def __get__(self):
             return self.span_c().id
 
         def __set__(self, attr_t id):
-            self.span_c().id = id
+            if id != self.span_c().id :
+                old_id = self.span_c().id
+                self.span_c().id = id
+                new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id)
+                old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id)
+                Underscore._replace_keys(old, new)
 
     property ent_id:
         """Alias for the span's ID."""

From cc574c14fed7f4864f2fd454fe222697c9d40155 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Sat, 17 Dec 2022 14:32:19 +0100
Subject: [PATCH 443/504] Fix v4 branch to build against Thinc v9 (#11921)

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 pyproject.toml                   | 5 ++---
 requirements.txt                 | 6 +++---
 setup.cfg                        | 4 ++--
 spacy/pipeline/morphologizer.pyx | 2 +-
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bfd7e68d1f7..77e1471426f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,8 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
-    "numpy>=1.15.0; python_version < '3.9'",
-    "numpy>=1.25.0; python_version >= '3.9'",
+    "thinc>=9.0.0.dev0,<9.1.0",
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index fe695a445c5..699057643c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 # Our libraries
-spacy-legacy>=4.0.0.dev0,<4.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
diff --git a/setup.cfg b/setup.cfg
index 2b41ab339c4..5959d398115 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.2.2,<8.3.0
-    wasabi>=0.9.1,<1.2.0
+    thinc>=9.0.0.dev0,<9.1.0
+    wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index cc8f87936b9..f822c38ac0e 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -302,7 +302,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []

From e21e5d29bc83013529749b7e0cdeb5e3795b89be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 29 Dec 2022 08:03:24 +0100
Subject: [PATCH 444/504] Adjust to new `Schedule` class and pass scores to
 `Optimizer` (#12008)

* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
---
 pyproject.toml   |  2 +-
 requirements.txt |  2 +-
 setup.cfg        |  4 ++--
 spacy/util.py    | 13 +++++++++----
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 77e1471426f..58e4382e829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev0,<9.1.0",
+    "thinc>=9.0.0.dev1,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 699057643c9..5c889c91f81 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev0,<9.1.0
+thinc>=9.0.0.dev1,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 5959d398115..9d4ccc7a4b3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,8 +38,8 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev0,<9.1.0
-    wasabi>=0.9.1,<1.1.0
+    thinc>=9.0.0.dev1,<9.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
diff --git a/spacy/util.py b/spacy/util.py
index 4f4718af5ff..a76e8f73eeb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,7 +2,12 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import inspect
+import re
+from pathlib import Path
+import thinc
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+import functools
 import itertools
 import logging
 import os
@@ -1617,12 +1622,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
         size_ = iter(size)
     items = iter(items)
-    while True:
-        batch_size = next(size_)
+    for step in itertools.count():
+        batch_size = size_(step)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From 4be02fc351092b4bd1ac68fcd102029fa133e55e Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso <tetsuok@users.noreply.github.com>
Date: Wed, 4 Jan 2023 01:43:09 +0900
Subject: [PATCH 445/504] Delete unused imports for StringStore (#12040)

---
 spacy/tokenizer.pxd | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index b2e50969462..2610532b75d 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,6 +2,10 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
+from .typedefs cimport hash_t
+from .structs cimport LexemeC, SpanC, TokenC
+from .tokens.doc cimport Doc
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc

From a31966da9476de169c4c00f510c58e8af9bf2a42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 16 Jan 2023 10:25:53 +0100
Subject: [PATCH 446/504] Add
 `TrainablePipe.{distill,get_teacher_student_loss}` (#12016)

* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/training/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index adfc2bb6658..9445d0b63a5 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,3 +1,6 @@
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401

From 10fccff7760bef01b61033f65138dbe4ca5c9b72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 11:27:45 +0100
Subject: [PATCH 447/504] Merge the parser refactor into `v4` (#10940)

* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89edb306608826dca50619ea8a60de2b14.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
  this required computing an intermittent flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut` size into
separate sequences that have at most `update_with_oracle_cut`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a git that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/ml/_precomputable_affine.py             | 164 -----
 spacy/ml/tb_framework.pxd                     |  28 +
 spacy/ml/tb_framework.py                      |  51 --
 spacy/ml/tb_framework.pyx                     | 621 ++++++++++++++++++
 .../_parser_internals/_parser_utils.pxd       |   2 +
 .../_parser_internals/_parser_utils.pyx       |  22 +
 spacy/pipeline/_parser_internals/_state.pxd   |  59 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +
 spacy/pipeline/_parser_internals/batch.pxd    |   2 +
 spacy/pipeline/_parser_internals/batch.pyx    |  52 ++
 spacy/pipeline/_parser_internals/ner.pyx      |   2 +
 .../pipeline/_parser_internals/stateclass.pyx |   7 +
 .../_parser_internals/transition_system.pxd   |   7 +
 .../_parser_internals/transition_system.pyx   |  70 ++
 .../{dep_parser.pyx => dep_parser.py}         |  17 +-
 spacy/pipeline/{ner.pyx => ner.py}            |  33 +-
 spacy/tests/parser/test_ner.py                |  11 +-
 spacy/tests/parser/test_parse.py              |  80 ++-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  53 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/legacy.mdx                   |   2 +-
 25 files changed, 1006 insertions(+), 382 deletions(-)
 delete mode 100644 spacy/ml/_precomputable_affine.py
 create mode 100644 spacy/ml/tb_framework.pxd
 delete mode 100644 spacy/ml/tb_framework.py
 create mode 100644 spacy/ml/tb_framework.pyx
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 create mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 create mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (97%)
 rename spacy/pipeline/{ner.pyx => ner.py} (93%)

diff --git a/setup.py b/setup.py
index 0eb529c2098..a4a87d68aec 100755
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,10 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -45,6 +43,7 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -52,6 +51,7 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2817147f3e9..36325711d4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,12 +90,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -111,12 +110,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -387,12 +385,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -405,12 +402,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
deleted file mode 100644
index 1c20c622b2c..00000000000
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
new file mode 100644
index 00000000000..965508519e8
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
deleted file mode 100644
index e351ad4e570..00000000000
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from thinc.api import Model, noop
-
-from ..util import registry
-from .parser_model import ParserStepModel
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
new file mode 100644
index 00000000000..79be13b00bd
--- /dev/null
+++ b/spacy/ml/tb_framework.pyx
@@ -0,0 +1,621 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+from typing import List, Tuple, Any, Optional, TypeVar, cast
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from libcpp.vector cimport vector
+import numpy
+cimport numpy as np
+from thinc.api import Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.api import NumpyOps
+from thinc.backends.cblas cimport CBlas, saxpy, sgemm
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
+from thinc.types import Ints1d, Ints2d
+
+from ..errors import Errors
+from ..pipeline._parser_internals import _beam_utils
+from ..pipeline._parser_internals.batch import GreedyBatch
+from ..pipeline._parser_internals._parser_utils cimport arg_max
+from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
+from ..pipeline._parser_internals.transition_system cimport TransitionSystem
+from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..tokens.doc import Doc
+from ..util import registry
+
+
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
+def TransitionModel(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    beam_width: int = 1,
+    beam_density: float = 0.0,
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
+
+    # FIXME: we use `output` as a container for the output layer's
+    # weights and biases. Thinc optimizers cannot handle resizing
+    # of parameters. So, when the parser model is resized, we
+    # construct a new `output` layer, which has a different key in
+    # the optimizer. Once the optimizer supports parameter resizing,
+    # we can replace the `output` layer by `output_W` and `output_b`
+    # parameters in this model.
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+
+    return Model(
+        name="parser_model",
+        forward=forward,
+        init=init,
+        layers=[tok2vec_projected, output],
+        refs={
+            "tok2vec": tok2vec_projected,
+            "output": output,
+        },
+        params={
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
+        attrs={
+            "beam_width": beam_width,
+            "beam_density": beam_density,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    output = model.get_ref("output")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        output.set_dim("nO", new_nO)
+        output.initialize()
+        return model
+    elif new_nO <= old_nO:
+        return model
+    elif output.has_param("W"):
+        nH = model.get_dim("nH")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
+        new_W[:old_nO] = old_W  # type: ignore
+        new_b[:old_nO] = old_b  # type: ignore
+        for i in range(old_nO, new_nO):
+            model.attrs["unseen_classes"].add(i)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
+    # TODO: Avoid this private intrusion
+    model._dims["nO"] = new_nO
+    return model
+
+
+def init(
+    model,
+    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
+    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
+):
+    if X is not None:
+        docs, moves = X
+        model.get_ref("tok2vec").initialize(X=docs)
+    else:
+        model.get_ref("tok2vec").initialize()
+    inferred_nO = _infer_nO(Y)
+    if inferred_nO is not None:
+        current_nO = model.maybe_get_dim("nO")
+        if current_nO is None or current_nO != inferred_nO:
+            model.attrs["resize_output"](model, inferred_nO)
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nH = model.get_dim("nH")
+    nI = model.get_dim("nI")
+    nF = model.get_dim("nF")
+    ops = model.ops
+
+    Wl = ops.alloc2f(nH * nP, nF * nI)
+    bl = ops.alloc1f(nH * nP)
+    padl = ops.alloc1f(nI)
+    # Wl = zero_init(ops, Wl.shape)
+    Wl = glorot_uniform_init(ops, Wl.shape)
+    padl = uniform_init(ops, padl.shape)  # type: ignore
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
+    # model = _lsuv_init(model)
+    return model
+
+
+class TransitionModelInputs:
+    """
+    Input to transition model.
+    """
+
+    # dataclass annotation is not yet supported in Cython 0.29.x,
+    # so, we'll do something close to it.
+
+    actions: Optional[List[Ints1d]]
+    docs: List[Doc]
+    max_moves: int
+    moves: TransitionSystem
+    states: Optional[List[State]]
+
+    __slots__ = [
+        "actions",
+        "docs",
+        "max_moves",
+        "moves",
+        "states",
+    ]
+
+    def __init__(
+        self,
+        docs: List[Doc],
+        moves: TransitionSystem,
+        actions: Optional[List[Ints1d]]=None,
+        max_moves: int=0,
+        states: Optional[List[State]]=None):
+        """
+        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
+        docs (List[Doc]): Docs to predict transition sequences for.
+        max_moves: (int): the maximum number of moves to apply, values less
+            than 1 will apply moves to states until they are final states.
+        moves (TransitionSystem): the transition system to use when predicting
+            the transition sequences.
+        states (Optional[List[States]]): the initial states to predict the
+            transition sequences for. When absent, the initial states are
+            initialized from the provided Docs.
+        """
+        self.actions = actions
+        self.docs = docs
+        self.moves = moves
+        self.max_moves = max_moves
+        self.states = states
+
+
+def forward(model, inputs: TransitionModelInputs, is_train: bool):
+    docs = inputs.docs
+    moves = inputs.moves
+    actions = inputs.actions
+
+    beam_width = model.attrs["beam_width"]
+    hidden_pad = model.get_param("hidden_pad")
+    tok2vec = model.get_ref("tok2vec")
+
+    states = moves.init_batch(docs) if inputs.states is None else inputs.states
+    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
+    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
+    seen_mask = _get_seen_mask(model)
+
+    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
+        # Note: max_moves is only used during training, so we don't need to
+        #       pass it to the greedy inference path.
+        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
+    else:
+        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
+            feats, backprop_feats, seen_mask, is_train, actions=actions,
+            max_moves=inputs.max_moves)
+
+
+def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
+                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+    cdef vector[StateC*] c_states
+    cdef StateClass state
+    for state in states:
+        if not state.is_final():
+            c_states.push_back(state.c)
+    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
+    # Precomputed features have rows for each token, plus one for padding.
+    cdef int n_tokens = feats.shape[0] - 1
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
+    cdef CBlas cblas = model.ops.cblas()
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+
+    def backprop(dY):
+        raise ValueError(Errors.E4004)
+
+    return (states, scores), backprop
+
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+    cdef int i, j
+    cdef vector[StateC *] unfinished
+    cdef ActivationsC activations = _alloc_activations(sizes)
+    cdef np.ndarray step_scores
+    cdef np.ndarray step_actions
+
+    scores = []
+    while sizes.states >= 1:
+        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
+        step_actions = actions[0] if actions is not None else None
+        with nogil:
+            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
+            if actions is None:
+                # Validate actions, argmax, take action.
+                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
+                    sizes.states)
+            else:
+                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+        sizes.states = unfinished.size()
+        scores.append(step_scores)
+        unfinished.clear()
+        actions = actions[1:] if actions is not None else None
+    _free_activations(&activations)
+
+    return scores
+
+
+def _forward_fallback(
+    model: Model,
+    moves: TransitionSystem,
+    states: List[StateClass],
+    tokvecs, backprop_tok2vec,
+    feats,
+    backprop_feats,
+    seen_mask,
+    is_train: bool,
+    actions: Optional[List[Ints1d]]=None,
+    max_moves: int=0):
+    nF = model.get_dim("nF")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+
+    beam_width = model.attrs["beam_width"]
+    beam_density = model.attrs["beam_density"]
+
+    ops = model.ops
+
+    all_ids = []
+    all_which = []
+    all_statevecs = []
+    all_scores = []
+    if beam_width == 1:
+        batch = GreedyBatch(moves, states, None)
+    else:
+        batch = _beam_utils.BeamBatch(
+            moves, states, None, width=beam_width, density=beam_density
+        )
+    arange = ops.xp.arange(nF)
+    n_moves = 0
+    while not batch.is_done:
+        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
+        for i, state in enumerate(batch.get_unfinished_states()):
+            state.set_context_tokens(ids, i, nF)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
+        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
+        preacts2f += hidden_b
+        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
+        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
+        statevecs, which = ops.maxout(preacts)
+        # We don't use output's backprop, since we want to backprop for
+        # all states at once, rather than a single state.
+        scores = output.predict(statevecs)
+        scores[:, seen_mask] = ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
+        cpu_scores = ops.to_numpy(scores)
+        if actions is None:
+            batch.advance(cpu_scores)
+        else:
+            batch.advance_with_actions(actions[0])
+            actions = actions[1:]
+        all_scores.append(scores)
+        if is_train:
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
+        if n_moves >= max_moves >= 1:
+            break
+        n_moves += 1
+
+    def backprop_parser(d_states_d_scores):
+        ids = ops.xp.vstack(all_ids)
+        which = ops.xp.vstack(all_which)
+        statevecs = ops.xp.vstack(all_statevecs)
+        _, d_scores = d_states_d_scores
+        if model.attrs.get("unseen_classes"):
+            # If we have a negative gradient (i.e. the probability should
+            # increase) on any classes we filtered out as unseen, mark
+            # them as seen.
+            for clas in set(model.attrs["unseen_classes"]):
+                if (d_scores[:, clas] < 0).any():
+                    model.attrs["unseen_classes"].remove(clas)
+        d_scores *= seen_mask == False
+        # Calculate the gradients for the parameters of the output layer.
+        # The weight gemm is (nS, nO) @ (nS, nH).T
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
+        # This gemm is (nS, nO) @ (nO, nH)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
+        # Backprop through the maxout activation
+        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
+        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        # We don't need to backprop the summation, because we pass back the IDs instead
+        d_state_features = backprop_feats((d_preacts2f, ids))
+        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
+        ops.scatter_add(d_tokvecs, ids, d_state_features)
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
+        return (backprop_tok2vec(d_tokvecs[:-1]), None)
+
+    return (list(batch), all_scores), backprop_parser
+
+
+def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
+    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = True
+    return mask
+
+
+def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
+    W: Floats2d = model.get_param("hidden_W")
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
+    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
+    W3f = W3f.transpose((1, 0, 2))
+    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
+    assert X.shape == (X.shape[0], nI), X.shape
+    Yf_ = model.ops.gemm(X, W2f, trans2=True)
+    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
+
+    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nH, nP), and get back:
+        # (nB, nH, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nH, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        dXf = model.ops.gemm(dY, W)
+        Xf = X[ids].reshape((ids.shape[0], -1))
+        dW = model.ops.gemm(dY, Xf, trans1=True)
+        model.inc_grad("hidden_W", dW)
+        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
+
+    return Yf, backward
+
+
+def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
+    if Y is None:
+        return None
+    _, scores = Y
+    if len(scores) == 0:
+        return None
+    assert scores[0].shape[0] >= 1
+    assert len(scores[0].shape) == 2
+    return scores[0].shape[1]
+
+
+def _lsuv_init(model: Model):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    W = model.maybe_get_param("hidden_W")
+    if W is not None and W.any():
+        return
+
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nH, nP, nI)
+    b = model.ops.alloc2f(nH, nP)
+    pad = model.ops.alloc4f(1, nF, nH, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc_f((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc_f((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("hidden_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("hidden_b", b)
+        else:
+            break
+    return model
+
+
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
+
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = <const float*>hidden_b.data
+    weights.hidden_weights = <const float *> output_W.data
+    weights.hidden_bias = <const float *> output_b.data
+    weights.seen_mask = <const int8_t*> seen_mask.data
+
+    return weights
+
+
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
+
+
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    _resize_activations(&A, n)
+    return A
+
+
+cdef void _free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    if W.hidden_weights == NULL:
+        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+                      1.0, <const float *>A.hiddens, n.hiddens,
+                      <const float *>W.hidden_weights, n.hiddens,
+                      0.0, scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = scores[0]
+    for i in range(1, n.states * n.classes):
+        if scores[i] < min_:
+            min_ = scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if W.seen_mask[j]:
+                scores[i*n.classes+j] = min_
+
+
+cdef void _sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    cdef int B = n.states
+    cdef int O = n.hiddens * n.pieces
+    cdef int F = n.feats
+    cdef int T = n.tokens
+    padding = cached + (T * F * O)
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
new file mode 100644
index 00000000000..7fee05bad60
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd
@@ -0,0 +1,2 @@
+cdef int arg_max(const float* scores, const int n_classes) nogil
+cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
new file mode 100644
index 00000000000..582756bf5be
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx
@@ -0,0 +1,22 @@
+# cython: infer_types=True
+
+cdef inline int arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index c063cf97cd4..1b6b25e5f16 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -8,6 +8,7 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -27,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -35,31 +36,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -132,19 +136,20 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
             return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
     int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
         else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
             if b_i >= this.length:
                 return -1
             else:
@@ -243,7 +248,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
             return 1
         else:
             return 0
@@ -327,7 +332,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -347,6 +352,9 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -355,12 +363,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = arcs.back()
+        arc = &arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -400,10 +408,11 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 9dda3bd5e44..462aa820e4f 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,6 +779,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -863,6 +865,7 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
new file mode 100644
index 00000000000..60734e549aa
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pxd
@@ -0,0 +1,2 @@
+cdef class Batch:
+    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
new file mode 100644
index 00000000000..91073b52e68
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pyx
@@ -0,0 +1,52 @@
+from typing import Any
+
+TransitionSystem = Any  # TODO
+
+cdef class Batch:
+    def advance(self, scores):
+        raise NotImplementedError
+
+    def get_states(self):
+        raise NotImplementedError
+
+    @property
+    def is_done(self):
+        raise NotImplementedError
+
+    def get_unfinished_states(self):
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+
+class GreedyBatch(Batch):
+    def __init__(self, moves: TransitionSystem, states, golds):
+        self._moves = moves
+        self._states = states
+        self._next_states = [s for s in states if not s.is_final()]
+
+    def advance(self, scores):
+        self._next_states = self._moves.transition_states(self._next_states, scores)
+
+    def advance_with_actions(self, actions):
+        self._next_states = self._moves.apply_actions(self._next_states, actions)
+
+    def get_states(self):
+        return self._states
+
+    @property
+    def is_done(self):
+        return all(s.is_final() for s in self._states)
+
+    def get_unfinished_states(self):
+        return [st for st in self._states if not st.is_final()]
+
+    def __getitem__(self, i):
+        return self._states[i]
+
+    def __len__(self):
+        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index ab522b1db79..dedf806db91 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -317,6 +317,8 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index f25408a13ba..c2a0d22956a 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,6 +21,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -177,3 +181,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 04cd10d8864..66cc7747b69 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,3 +57,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 4a0feb435dd..7bd39ba43c5 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,6 +3,8 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -74,7 +76,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -86,6 +99,8 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -111,6 +126,7 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -138,6 +154,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +288,35 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil:
+        cdef int i
+        cdef Transition action
+        cdef StateC* state
+        for i in range(batch_size):
+            state = states[i]
+            action = moves.c[actions[i]]
+            action.do(state, action.label)
+            state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+    int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)
+
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py
similarity index 97%
rename from spacy/pipeline/dep_parser.pyx
rename to spacy/pipeline/dep_parser.py
index cbd7187ff0f..c996074d2c4 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.py
@@ -5,6 +5,8 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from ._parser_internals.arc_eager cimport ArcEager
 from .transition_parser cimport Parser
@@ -22,12 +24,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -233,6 +234,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -240,8 +242,11 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -254,11 +259,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -278,8 +284,7 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py
similarity index 93%
rename from spacy/pipeline/ner.pyx
rename to spacy/pipeline/ner.py
index fe54d33a17b..41280c49390 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.py
@@ -10,6 +10,13 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
+from ..util import registry
+from ..training import remove_bilu_prefix
 
 from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
@@ -21,12 +28,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -51,8 +57,12 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -119,7 +129,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -193,11 +208,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -215,15 +231,14 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,   # not relevant for NER
+            min_action_freq=1,  # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 7c3a9d56249..d9cbf5e8c72 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,8 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +415,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +542,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,7 +578,6 @@ def test_overfitting_IO(use_upper):
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
-        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index f63d56f6922..9eccb056ce2 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,16 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.pipeline import DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
@@ -59,6 +60,8 @@
     ),
 ]
 
+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1
 
 
@@ -171,6 +174,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -319,7 +373,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -345,11 +399,15 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
+    fix_random_seed(0)
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -455,10 +513,12 @@ def test_distill(max_moves):
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index c3c4bb6c686..f6cefbc1f84 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v2"
+    @architectures = "spacy.TransitionBasedParser.v3"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 8a1c74ca9ed..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,33 +189,11 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -246,7 +224,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser
 
@@ -360,15 +337,16 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        assert model.get_ref("upper").get_dim("nI") == 65
-        assert model.get_ref("lower").get_dim("nI") == 65
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -381,11 +359,13 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        if model.attrs["has_upper"]:
-            assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
@@ -581,9 +561,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index d2a41ff0fed..160e2ecf335 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,24 +1,13 @@
 import ctypes
 import os
 from pathlib import Path
-
-import pytest
-
-try:
-    from pydantic.v1 import ValidationError
-except ImportError:
-    from pydantic import ValidationError  # type: ignore
-
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -101,34 +90,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..db8f974ea19 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,18 +833,17 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -874,23 +873,22 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                       |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
-</Accordion>
+ </Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index b44df538766..44c80622437 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}

From 52dc989ae3a27691a34674540c4b3e5315a6f810 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 18:28:30 +0100
Subject: [PATCH 448/504] Fix batching regression (#12094)

* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handed.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 spacy/util.py    | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 58e4382e829..3891d137867 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 5c889c91f81..3a1ef6b70b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index 9d4ccc7a4b3..d1a4e51353e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,7 +38,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev1,<9.1.0
+    thinc>=9.0.0.dev2,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/util.py b/spacy/util.py
index a76e8f73eeb..20d7cbb5726 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1622,12 +1622,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
         size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From 946b289ccbb17ffa78d1d2ca441dec9a5bc27123 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:48:20 +0100
Subject: [PATCH 449/504] Drop python 3.6/3.7, remove unneeded compat (#12187)

* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
---
 spacy/cli/_util.py      | 10 ++++++++++
 spacy/cli/debug_data.py |  8 ++++++++
 spacy/schemas.py        |  9 +++++++++
 spacy/util.py           |  2 +-
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index eed61119070..977912443bd 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,3 +1,10 @@
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
+from typing import TYPE_CHECKING, overload
+import sys
+import shutil
+from pathlib import Path
+from wasabi import msg, Printer
+import srsly
 import hashlib
 import os
 import shutil
@@ -27,6 +34,9 @@
 from typer.main import get_command
 from wasabi import Printer, msg
 
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 7a98e6d563c..60f760ccb52 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,3 +1,11 @@
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import Literal, cast, overload
+from pathlib import Path
+from collections import Counter
+import sys
+import srsly
+from wasabi import Printer, MESSAGES, msg
+import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 6b41bb5b2b7..cf9d3064065 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,3 +1,12 @@
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
+from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError, Model
+from thinc.config import Promise
+from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/util.py b/spacy/util.py
index 20d7cbb5726..ae1135234e3 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -65,7 +65,7 @@
 
 
 from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .compat import cupy, CudaStream, is_windows
 from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows

From ba4aef7bb67c2f7693c010ed07b7eb574ba53d04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Jan 2023 12:44:11 +0100
Subject: [PATCH 450/504] Add `Language.distill` (#12116)

* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify that how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/tests/test_language.py  |  6 ++++++
 website/docs/api/language.mdx | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 6ed0f44eab9..941edf0fedc 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -85,6 +85,12 @@
 ]
 
 
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index f3fad41314e..76743d84f9d 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -389,6 +389,34 @@ Distill the models in a student pipeline from a teacher pipeline.
 | `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
 | **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
+## Language.distill {id="distill",tag="method,experimental",version="4"}
+
+Distill the models in a student pipeline from a teacher pipeline.
+
+> #### Example
+>
+> ```python
+>
+> teacher = spacy.load("en_core_web_lg")
+> student = English()
+> student.add_pipe("tagger")
+> student.distill(teacher, examples, sgd=optimizer)
+> ```
+
+| Name                 | Description                                                                                                                                                                                 |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_       |                                                                                                                                                                                             |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the

From f50a11f8d8ca28e8a41f60f069c9fcba5ebc14d9 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 31 Jan 2023 19:31:17 +0900
Subject: [PATCH 451/504] Don't re-download installed models (#12188)

* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/download.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0b8ed54ed3c..ea7a06c5417 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,8 @@
 
 from .. import about
 from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
+from ..util import get_package_version
 
 
 @app.command(

From a713297a0f9c902d9c6c7d6689cab1d2f71289f0 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:30:43 +0100
Subject: [PATCH 452/504] Rename language codes (Icelandic, multi-language)
 (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/_util.py   | 1 +
 spacy/cli/convert.py | 2 ++
 spacy/errors.py      | 1 +
 3 files changed, 4 insertions(+)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 977912443bd..644f3e5ef24 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -37,6 +37,7 @@
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a282e59c749..19591a05c94 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,6 +8,8 @@
 import srsly
 from wasabi import Printer
 
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
diff --git a/spacy/errors.py b/spacy/errors.py
index 22532756e0c..b814218815f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -981,6 +981,7 @@ class Errors(metaclass=ErrorsWithCodes):
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
 
+RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
 
 # fmt: on
 

From 07e2ed31e2b3c1ffe977dfc4514363084b40b66b Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 1 Feb 2023 17:47:56 +0900
Subject: [PATCH 453/504] Move Entity Linker v1 to spacy-legacy (#12006)

* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
---
 spacy/tests/pipeline/test_entity_linker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 32e7a265f37..33e8d47400e 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1090,6 +1090,8 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
+    from spacy_legacy.components.entity_linker import EntityLinker_v1
+
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From f25877084915f24d4cf6edfe23304770f09fa4a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 21 Feb 2023 15:47:18 +0100
Subject: [PATCH 454/504] Reimplement distillation with oracle cut size
 (#12214)

* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 101 +++++++++++++++++++++------
 spacy/tests/parser/test_model.py     |  61 ++++++++++++++++
 3 files changed, 144 insertions(+), 22 deletions(-)
 create mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00bd..9b2114900d3 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 632616db759..c1bc2c28da9 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -65,6 +65,11 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 
 
 
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
+
 NUMPY_OPS = NumpyOps()
 
 
@@ -283,8 +288,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -295,12 +300,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs,
+            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
+            states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -550,7 +555,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_to_actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -670,7 +675,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch(self, teacher_step_model, docs, max_length):
+    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -679,10 +684,12 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
-            StateClass start_state
             StateClass state
-            Transition action
-        all_states = self.moves.init_batch(docs)
+            TransitionSystem moves = teacher_pipe.moves
+
+        # Start with the same heuristic as in supervised training: exclude
+        # docs that are within the maximum length.
+        all_states = moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -691,18 +698,28 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
+
+        if not to_cut:
+            return states
+
+        # Parse the states that are too long with the teacher's parsing model.
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+            states=[state.copy() for state in to_cut])
+        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+
+        # Step through the teacher's actions and store every state after
+        # each multiple of max_length.
+        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            # Move states forward max_length actions.
-            length = 0
-            while to_cut and length < max_length:
-                teacher_scores = teacher_step_model.predict(to_cut)
-                self.transition_states(to_cut, teacher_scores)
-                # States that are completed do not need further cutting.
-                to_cut = [state for state in to_cut if not state.is_final()]
-                length += 1
-        return states
+            for step_actions in teacher_actions[:max_length]:
+                to_cut = moves.apply_actions(to_cut, step_actions)
+            teacher_actions = teacher_actions[max_length:]
 
+            if len(teacher_actions) < max_length:
+                break
+
+        return states
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
@@ -764,7 +781,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
+def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -785,3 +802,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
+
+def _states_diff_to_actions(
+    before_states: List[StateClass],
+    after_states: List[StateClass]
+) -> List[Ints1d]:
+    """
+    Return for two sets of states the actions to go from the first set of
+    states to the second set of states. The histories of the first set of
+    states must be a prefix of the second set of states.
+    """
+    cdef StateClass before_state, after_state
+    cdef StateC* c_state_before
+    cdef StateC* c_state_after
+
+    assert len(before_states) == len(after_states)
+
+    # Check invariant: before states histories must be prefixes of after states.
+    for before_state, after_state in zip(before_states, after_states):
+        c_state_before = before_state.c
+        c_state_after = after_state.c
+
+        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+            c_state_after.history.begin())
+
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for before_state, after_state in zip(before_states, after_states):
+            c_state_before = before_state.c
+            c_state_after = after_state.c
+            if step < c_state_after.history.size() - c_state_before.history.size():
+                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 00000000000..8c1cf7a9346
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )

From a15572a810c127f297050c4d684bff6e020a2887 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 455/504] Drop support for EntityLinker_v1. (#12377)

---
 spacy/tests/pipeline/test_entity_linker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 33e8d47400e..9d533a69977 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1090,7 +1090,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
 
     # Ensure that the legacy architectures still work
     vector_length = 3

From 798533626c69915b3982c78347e91e59b7880e11 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 456/504] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/pipeline/test_entity_linker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 9d533a69977..32e7a265f37 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1090,7 +1090,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From 52dd3bcb0d7067f427f9a963c0af8ef8e3366885 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 Apr 2023 16:01:59 +0200
Subject: [PATCH 457/504] Enforce that Span.start/end(_char) remain valid and
 in sync (#12268)

* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
---
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx        | 49 +++++++++++++++++++++++++++---------
 2 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 74874624888..0b05ca7c123 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -696,3 +696,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 79a907abe6d..43779f20dc8 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -793,36 +793,61 @@ cdef class Span:
             return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0:
-                raise IndexError("TODO")
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx
 
     property end:
         def __get__(self):
             return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0:
-                raise IndexError("TODO")
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError("TODO")
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError("TODO")
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end
 
     property label:
         def __get__(self):

From 804ef25e018e79e067e3fcade3835d90e51be5ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 21 Apr 2023 13:49:40 +0200
Subject: [PATCH 458/504] Add distillation loop (#12542)

* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We apply finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/util.py b/spacy/util.py
index ae1135234e3..624fffe865d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,7 @@
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+from thinc.api import fix_random_seed, set_gpu_allocator
 import functools
 import itertools
 import logging

From 0d277d15a0092e98ada80019e4e837ef38c9305e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 11:41:03 +0200
Subject: [PATCH 459/504] isort all the things

---
 spacy/cli/_util.py                          | 11 ------
 spacy/cli/convert.py                        |  2 --
 spacy/cli/debug_data.py                     |  8 -----
 spacy/cli/download.py                       | 12 +++++--
 spacy/matcher/phrasematcher.pyx             |  5 +--
 spacy/ml/models/tok2vec.py                  |  2 --
 spacy/ml/tb_framework.pyx                   | 37 +++++++++++++++------
 spacy/morphology.pxd                        |  2 +-
 spacy/morphology.pyx                        |  6 ++--
 spacy/pipeline/_parser_internals/_state.pxd |  2 --
 spacy/pipeline/dep_parser.py                |  7 ----
 spacy/pipeline/ner.py                       | 14 --------
 spacy/pipeline/span_ruler.py                |  8 -----
 spacy/pipeline/textcat.py                   |  4 ---
 spacy/schemas.py                            |  9 -----
 spacy/strings.pxd                           |  3 --
 spacy/tests/doc/test_span.py                |  1 -
 spacy/tests/doc/test_underscore.py          |  1 +
 spacy/tests/parser/test_ner.py              |  2 --
 spacy/tests/parser/test_parse.py            | 10 +++---
 spacy/tests/pipeline/test_entity_ruler.py   |  6 ----
 spacy/tests/test_misc.py                    | 20 +++++++----
 spacy/tokenizer.pxd                         |  4 ---
 spacy/tokenizer.pyx                         |  6 ----
 spacy/tokens/__init__.py                    |  3 +-
 spacy/tokens/doc.pyx                        | 15 +--------
 spacy/tokens/morphanalysis.pxd              |  7 ++--
 spacy/tokens/morphanalysis.pyx              |  4 ---
 spacy/tokens/span.pxd                       |  1 -
 spacy/tokens/token.pyx                      |  1 +
 spacy/training/__init__.py                  |  3 --
 spacy/training/callbacks.py                 |  6 ++--
 spacy/util.py                               | 11 +-----
 33 files changed, 74 insertions(+), 159 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 644f3e5ef24..eed61119070 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,10 +1,3 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
 import os
 import shutil
@@ -34,10 +27,6 @@
 from typer.main import get_command
 from wasabi import Printer, msg
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 19591a05c94..a282e59c749 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,8 +8,6 @@
 import srsly
 from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 60f760ccb52..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,11 +1,3 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea7a06c5417..0635522930b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,9 +7,15 @@
 from wasabi import msg
 
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 87da110de04..107d7d926ee 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,8 +1,9 @@
 # cython: infer_types=True, profile=True
-from typing import List
 from collections import defaultdict
+from typing import List
+
 from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings
 
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index a605d32cd40..61bc7291e2e 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -22,8 +22,6 @@
 from ...attrs import intify_attr
 from ...errors import Errors
 from ...ml import character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...tokens import Doc
 from ...util import registry
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 9b2114900d3..fd0af12ceab 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,28 +1,45 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import List, Tuple, Any, Optional, TypeVar, cast
-from libc.string cimport memset, memcpy
+from typing import Any, List, Optional, Tuple, TypeVar, cast
+
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
+
 import numpy
+
 cimport numpy as np
-from thinc.api import Model, normal_init, chain, list2array, Linear
-from thinc.api import uniform_init, glorot_uniform_init, zero_init
-from thinc.api import NumpyOps
+
+from thinc.api import (
+    Linear,
+    Model,
+    NumpyOps,
+    chain,
+    glorot_uniform_init,
+    list2array,
+    normal_init,
+    uniform_init,
+    zero_init,
+)
+
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
-from thinc.types import Ints1d, Ints2d
+
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
 from ..pipeline._parser_internals.batch import GreedyBatch
+
 from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
-from ..pipeline._parser_internals.transition_system cimport TransitionSystem
 from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..pipeline._parser_internals.transition_system cimport (
+    TransitionSystem,
+    c_apply_actions,
+    c_transition_batch,
+)
+
 from ..tokens.doc import Doc
 from ..util import registry
 
-
 State = Any  # TODO
 
 
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index ab8f854497b..df467908940 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,8 +1,8 @@
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport shared_ptr
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.memory cimport shared_ptr
 
 from .strings cimport StringStore
 from .structs cimport MorphAnalysisC
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 57e4e7d10a3..9a8b8bb51b7 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,10 +1,12 @@
 # cython: infer_types
 import warnings
-from typing import Union, Tuple, List, Dict, Optional
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy
+
 from cython.operator cimport dereference as deref
 from libcpp.memory cimport shared_ptr
 
-from .errors import Warnings
 from . import symbols
 from .errors import Warnings
 
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1b6b25e5f16..1c61ac271d8 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,5 +1,4 @@
 cimport libcpp
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -8,7 +7,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py
index c996074d2c4..b4961487b83 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.py
@@ -4,13 +4,6 @@
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.arc_eager import ArcEager
-
-from ._parser_internals.arc_eager cimport ArcEager
-from .transition_parser cimport Parser
-
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 41280c49390..1c7cf151385 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -11,20 +11,6 @@
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser import Parser
-from ._parser_internals.ner import BiluoPushDown
-from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
-from ..util import registry
-from ..training import remove_bilu_prefix
-
-from ._parser_internals.ner cimport BiluoPushDown
-from .transition_parser cimport Parser
-
-from ..language import Language
-from ..scorer import get_ner_prf
-from ..training import remove_bilu_prefix
-from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 3f876598013..cd8fea36b47 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,14 +17,6 @@
 
 import srsly
 
-from .pipe import Pipe
-from ..training import Example
-from ..language import Language
-from ..errors import Errors, Warnings
-from ..util import ensure_path, SimpleFrozenList, registry
-from ..tokens import Doc, Span
-from ..scorer import Scorer, get_ner_prf
-from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 79a98b9bc5f..13841dd7bbb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,7 +1,3 @@
-from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
-from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
-from thinc.types import Floats2d
-import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/spacy/schemas.py b/spacy/schemas.py
index cf9d3064065..6b41bb5b2b7 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,12 +1,3 @@
-from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
-from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
-from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
-from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError, Model
-from thinc.config import Promise
-from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 688dbc46261..c05731c9a15 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,6 +1,3 @@
-from libc.stdint cimport int64_t, uint32_t
-from libcpp.vector cimport vector
-from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 0b05ca7c123..cf850a2234d 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -6,7 +6,6 @@
 from spacy.attrs import LENGTH, ORTH
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from spacy.vocab import Vocab
 
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index d09126a7b2c..c6bb5ad4e33 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,7 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+
 # Helper functions
 def _get_tuple(s: Span):
     return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index d9cbf5e8c72..f0efc3a63fd 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,8 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-from thinc.api import fix_random_seed
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 9eccb056ce2..fe82ad2fde0 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,17 +1,19 @@
 import itertools
-import pytest
+
 import numpy
+import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.training import Example
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
+from spacy.training import Example
 from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 74731140688..12f2c9def2d 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,12 +2,6 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
-from spacy.tokens import Doc, Span
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.pipeline import EntityRecognizer, merge_entities
-from spacy.pipeline import SpanRuler
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 160e2ecf335..c05ef625e11 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,19 @@
 import ctypes
 import os
 from pathlib import Path
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
-from spacy.util import find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+
+import pytest
+from pydantic import ValidationError
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 2610532b75d..b2e50969462 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,10 +2,6 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
-from .typedefs cimport hash_t
-from .structs cimport LexemeC, SpanC, TokenC
-from .tokens.doc cimport Doc
-from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 2d6c879e360..1fc5f310920 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -9,17 +9,11 @@ from preshed.maps cimport PreshMap
 
 import re
 
-from .tokens.doc cimport Doc
-from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
-from .attrs import intify_attrs
-from .symbols import ORTH, NORM
-from .errors import Errors
 from . import util
-from .util import get_words_and_spaces
 from .attrs import intify_attrs
 from .errors import Errors
 from .scorer import Scorer
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 16c43485340..7617e462fde 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,7 +4,6 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .doc_bin import DocBin
-from .morphanalysis import MorphAnalysis
+from .token import Token
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 169199bc563..4b8a15a65fd 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -20,15 +20,8 @@ from thinc.util import copy_array
 
 from .span cimport Span
 from .token cimport MISSING_DEP
-from .span_groups import SpanGroups
-from .token cimport Token
-from ..lexeme cimport Lexeme, EMPTY_LEXEME
-from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport attr_id_t
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
-from ._dict_proxies import SpanGroups
+from .span_groups import SpanGroups
 
 from ..attrs cimport (
     DEP,
@@ -57,12 +50,6 @@ from ..attrs import IDS, intify_attr
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from ..morphology import Morphology
-from .. import util
-from .. import parts_of_speech
-from .. import schemas
-from .underscore import Underscore, get_ext_args
-from .retokenizer import Retokenizer
-from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from ..util import get_words_and_spaces
 from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS
 from .retokenizer import Retokenizer
diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd
index f866488ecc2..73922c62b9b 100644
--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@@ -1,8 +1,9 @@
-from ..vocab cimport Vocab
-from ..typedefs cimport hash_t
-from ..morphology cimport MorphAnalysisC
 from libcpp.memory cimport shared_ptr
 
+from ..morphology cimport MorphAnalysisC
+from ..typedefs cimport hash_t
+from ..vocab cimport Vocab
+
 
 cdef class MorphAnalysis:
     cdef readonly Vocab vocab
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index be9f32c99d3..e5665496ff4 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -9,10 +9,6 @@ from libcpp.memory cimport shared_ptr
 from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
-from libcpp.memory cimport shared_ptr
-from cython.operator cimport dereference as deref
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 68f722a13cb..fb592e68bd8 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,4 +1,3 @@
-from libcpp.memory cimport shared_ptr
 cimport numpy as np
 from libcpp.memory cimport shared_ptr
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index ec79f19cf20..d9fdcb75263 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -40,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+
 from cython.operator cimport dereference as deref
 
 from cython.operator cimport dereference as deref
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 9445d0b63a5..adfc2bb6658 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,6 +1,3 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
-from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 21c3d56a118..c2f3b8b51fa 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import Callable, Optional
 
 from ..errors import Errors
+from ..language import Language
 from ..util import load_model, logger, registry
 
-if TYPE_CHECKING:
-    from ..language import Language
-
 
 @registry.callbacks("spacy.copy_from_base_model.v1")
 def create_copy_from_base_model(
diff --git a/spacy/util.py b/spacy/util.py
index 624fffe865d..ae9837e3afe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,13 +2,7 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model, constant as constant_schedule
-from thinc.api import fix_random_seed, set_gpu_allocator
-import functools
+import inspect
 import itertools
 import logging
 import os
@@ -65,9 +59,6 @@
     cupy = None
 
 
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
-from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From 9f5fc1a1653568c5b860deb62ce30a0f6a33032f Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 16:38:29 +0200
Subject: [PATCH 460/504] merge fixes

---
 spacy/pipeline/transition_parser.pyx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index c1bc2c28da9..8ff92aecf74 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -70,6 +70,11 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
+
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
 NUMPY_OPS = NumpyOps()
 
 

From 1c75b1b0e3e91f2c8c23383cf16752fe3c7c30df Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 17:41:29 +0200
Subject: [PATCH 461/504] cython fixes and cleanup

---
 spacy/matcher/phrasematcher.pyx               |  2 -
 spacy/ml/tb_framework.pyx                     | 55 ++++++++++---------
 spacy/morphology.pyx                          |  6 +-
 spacy/pipeline/_parser_internals/ner.pyx      |  1 -
 .../_parser_internals/transition_system.pxd   |  4 +-
 .../_parser_internals/transition_system.pyx   | 21 ++++---
 spacy/pipeline/transition_parser.pyx          | 54 ++++++++++--------
 spacy/tokens/span.pyx                         |  2 -
 8 files changed, 72 insertions(+), 73 deletions(-)

diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 107d7d926ee..d1a8eaf33c4 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -160,7 +160,6 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-
     def _add_from_arrays(self, key, specs, *, on_match=None):
         """Add a preprocessed list of specs, with an optional callback.
 
@@ -196,7 +195,6 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-
     def add(self, key, docs, *, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, a list of one or more patterns, and (optionally) an on_match callback.
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index fd0af12ceab..ed04045a6a5 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (
 
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
 
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
     Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
     if X is not None:
-        docs, moves = X
+        docs, _ = X
         model.get_ref("tok2vec").initialize(X=docs)
     else:
         model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")
@@ -192,9 +192,10 @@ class TransitionModelInputs:
         self,
         docs: List[Doc],
         moves: TransitionSystem,
-        actions: Optional[List[Ints1d]]=None,
-        max_moves: int=0,
-        states: Optional[List[State]]=None):
+        actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0,
+        states: Optional[List[State]] = None,
+    ):
         """
         actions (Optional[List[Ints1d]]): actions to apply for each Doc.
         docs (List[Doc]): Docs to predict transition sequences for.
@@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
     else:
         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-            feats, backprop_feats, seen_mask, is_train, actions=actions,
-            max_moves=inputs.max_moves)
+                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
+                                 max_moves=inputs.max_moves)
 
 
 def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
     cdef vector[StateC*] c_states
     cdef StateClass state
     for state in states:
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
 
     return (states, scores), backprop
 
+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
     cdef vector[StateC *] unfinished
     cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
@@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                    sizes.states)
+                                   sizes.states)
             else:
                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
             for i in range(sizes.states):
@@ -302,8 +304,8 @@ def _forward_fallback(
     backprop_feats,
     seen_mask,
     is_train: bool,
-    actions: Optional[List[Ints1d]]=None,
-    max_moves: int=0):
+    actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")
@@ -371,7 +373,7 @@ def _forward_fallback(
             for clas in set(model.attrs["unseen_classes"]):
                 if (d_scores[:, clas] < 0).any():
                     model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
         # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
         output.inc_grad("b", d_scores.sum(axis=0))
@@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
@@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
     else:
         # Compute hidden-to-output
         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                      1.0, <const float *>A.hiddens, n.hiddens,
-                      <const float *>W.hidden_weights, n.hiddens,
-                      0.0, scores, n.classes)
+                     1.0, <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens,
+                     0.0, scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                 scores[i*n.classes+j] = min_
 
 
-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
     cdef int F = n.feats
     cdef int T = n.tokens
     padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                 feature = &cached[idx]
             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
         token_ids += F
-
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 9a8b8bb51b7..665e964bfd5 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -80,15 +80,13 @@ cdef class Morphology:
         out.sort(key=lambda x: x[0])
         return dict(out)
 
-
     def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
         norm_feats_string = self.FEATURE_SEP.join([
-                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
             for field, values in feats.items()
-        ])
+            ])
         return norm_feats_string or self.EMPTY_MORPH
 
-
     cdef hash_t _add(self, features):
         """Insert a morphological analysis in the morphology table, if not
         already present. The morphological analysis may be provided in the UD
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index dedf806db91..bd4e06dedb3 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -4,7 +4,6 @@ from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
-from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr
 from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 66cc7747b69..08baed932ba 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -60,7 +60,7 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil
+                          int batch_size) nogil
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-        int nr_class, int batch_size) nogil
+                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 7bd39ba43c5..ae1cf890f3e 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -291,19 +291,19 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil:
-        cdef int i
-        cdef Transition action
-        cdef StateC* state
-        for i in range(batch_size):
-            state = states[i]
-            action = moves.c[actions[i]]
-            action.do(state, action.label)
-            state.history.push_back(action.clas)
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
 
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-    int nr_class, int batch_size) nogil:
+                             int nr_class, int batch_size) nogil:
     is_valid = <int*>calloc(moves.n_moves, sizeof(int))
     cdef int i, guess
     cdef Transition action
@@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
             action.do(states[i], action.label)
             states[i].history.push_back(guess)
     free(is_valid)
-
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 8ff92aecf74..690c1ebb8ed 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -251,12 +251,13 @@ class Parser(TrainablePipe):
         raise NotImplementedError
 
     def distill(self,
-               teacher_pipe: Optional[TrainablePipe],
-               examples: Iterable["Example"],
-               *,
-               drop: float=0.0,
-               sgd: Optional[Optimizer]=None,
-               losses: Optional[Dict[str, float]]=None):
+                teacher_pipe: Optional[TrainablePipe],
+                examples: Iterable["Example"],
+                *,
+                drop: float = 0.0,
+                sgd: Optional[Optimizer] = None,
+                losses: Optional[Dict[str, float]] = None
+                ):
         """Train a pipe (the student) on the predictions of another pipe
         (the teacher). The student is trained on the transition probabilities
         of the teacher.
@@ -306,11 +307,13 @@ class Parser(TrainablePipe):
         # teacher's distributions.
 
         student_inputs = TransitionModelInputs(docs=student_docs,
-            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
+                                               states=[state.copy() for state in states],
+                                               moves=self.moves,
+                                               max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
         actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            states=states, moves=teacher_pipe.moves, actions=actions)
+                                               states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -324,8 +327,8 @@ class Parser(TrainablePipe):
         return losses
 
     def get_teacher_student_loss(
-        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-        normalize: bool=False,
+            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+            normalize: bool = False,
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -348,9 +351,9 @@ class Parser(TrainablePipe):
         # ourselves.
 
         teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
         student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-            axis=-1, inplace=True)
+                                                axis=-1, inplace=True)
 
         assert teacher_scores.shape == student_scores.shape
 
@@ -466,13 +469,15 @@ class Parser(TrainablePipe):
         else:
             init_states, gold_states, _ = self.moves.init_gold_batch(examples)
 
-        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
-            max_moves=max_moves, states=[state.copy() for state in init_states])
+        inputs = TransitionModelInputs(docs=docs,
+                                       moves=self.moves,
+                                       max_moves=max_moves,
+                                       states=[state.copy() for state in init_states])
         (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
         if sum(s.shape[0] for s in scores) == 0:
             return losses
         d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-            examples, max_moves)
+                                 examples, max_moves)
         backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
@@ -513,9 +518,7 @@ class Parser(TrainablePipe):
         cdef TransitionSystem moves = self.moves
         cdef StateClass state
         cdef int clas
-        cdef int nF = self.model.get_dim("nF")
         cdef int nO = moves.n_moves
-        cdef int nS = sum([len(history) for history in histories])
         cdef Pool mem = Pool()
         cdef np.ndarray costs_i
         is_valid = <int*>mem.alloc(nO, sizeof(int))
@@ -582,8 +585,8 @@ class Parser(TrainablePipe):
 
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0.,
+                    sgd=None, losses=None, beam_density=0.0):
         raise NotImplementedError
 
     def set_output(self, nO):
@@ -708,9 +711,10 @@ class Parser(TrainablePipe):
             return states
 
         # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
-            states=[state.copy() for state in to_cut])
-        (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
+        teacher_inputs = TransitionModelInputs(docs=docs,
+                                               moves=moves,
+                                               states=[state.copy() for state in to_cut])
+        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
 
         # Step through the teacher's actions and store every state after
         # each multiple of max_length.
@@ -808,6 +812,7 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
 
     return actions
 
+
 def _states_diff_to_actions(
     before_states: List[StateClass],
     after_states: List[StateClass]
@@ -828,8 +833,9 @@ def _states_diff_to_actions(
         c_state_before = before_state.c
         c_state_after = after_state.c
 
-        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
-            c_state_after.history.begin())
+        assert equal(c_state_before.history.begin(),
+                     c_state_before.history.end(),
+                     c_state_after.history.begin())
 
     actions = []
     while True:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 43779f20dc8..65830ea0f5c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -230,7 +230,6 @@ cdef class Span:
 
     @property
     def _(self):
-        cdef SpanC* span_c = self.span_c()
         """Custom extension attributes registered via `set_extension`."""
         cdef SpanC* span_c = self.span_c()
         return Underscore(Underscore.span_extensions, self,
@@ -950,7 +949,6 @@ cdef class Span:
             self.id_ = ent_id_
 
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From 4d509fa86bad3415e1fcd6129e411224900b28d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 14:38:05 +0100
Subject: [PATCH 462/504] Revert "Reimplement distillation with oracle cut size
 (#12214)"

This reverts commit e27c60a70263f7ab17968964de37e938653e37a2.
---
 spacy/ml/tb_framework.pyx            |   4 +-
 spacy/pipeline/transition_parser.pyx | 105 ++++++---------------------
 spacy/tests/parser/test_model.py     |  61 ----------------
 spacy/tests/parser/test_ner.py       |   5 +-
 4 files changed, 23 insertions(+), 152 deletions(-)
 delete mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index ed04045a6a5..b81553323e4 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -268,11 +268,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1 and (actions is None or len(actions) > 0):
+    while sizes.states >= 1:
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
-        assert step_actions is None or step_actions.size == sizes.states, \
-            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 690c1ebb8ed..68add292459 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -71,10 +71,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 
 
 
-# TODO: Remove when we switch to Cython 3.
-cdef extern from "<algorithm>" namespace "std" nogil:
-    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
-
 NUMPY_OPS = NumpyOps()
 
 
@@ -294,8 +290,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -306,14 +302,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs,
-                                               states=[state.copy() for state in states],
-                                               moves=self.moves,
-                                               max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_diff_to_actions(states, student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-                                               states=states, moves=teacher_pipe.moves, actions=actions)
+            moves=self.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -563,7 +557,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = _states_to_actions(student_states)
+        actions = states2actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -683,7 +677,7 @@ class Parser(TrainablePipe):
                     raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
+    def _init_batch(self, teacher_step_model, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
@@ -692,12 +686,10 @@ class Parser(TrainablePipe):
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
         cdef:
+            StateClass start_state
             StateClass state
-            TransitionSystem moves = teacher_pipe.moves
-
-        # Start with the same heuristic as in supervised training: exclude
-        # docs that are within the maximum length.
-        all_states = moves.init_batch(docs)
+            Transition action
+        all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -706,30 +698,19 @@ class Parser(TrainablePipe):
                     states.append(state)
                 else:
                     to_cut.append(state)
-
-        if not to_cut:
-            return states
-
-        # Parse the states that are too long with the teacher's parsing model.
-        teacher_inputs = TransitionModelInputs(docs=docs,
-                                               moves=moves,
-                                               states=[state.copy() for state in to_cut])
-        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
-
-        # Step through the teacher's actions and store every state after
-        # each multiple of max_length.
-        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            for step_actions in teacher_actions[:max_length]:
-                to_cut = moves.apply_actions(to_cut, step_actions)
-            teacher_actions = teacher_actions[max_length:]
-
-            if len(teacher_actions) < max_length:
-                break
-
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
         return states
 
+
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long doc will get multiple states. Let's say we
@@ -790,7 +771,7 @@ def _change_attrs(model, **kwargs):
             model.attrs[key] = value
 
 
-def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -811,47 +792,3 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
-
-
-def _states_diff_to_actions(
-    before_states: List[StateClass],
-    after_states: List[StateClass]
-) -> List[Ints1d]:
-    """
-    Return for two sets of states the actions to go from the first set of
-    states to the second set of states. The histories of the first set of
-    states must be a prefix of the second set of states.
-    """
-    cdef StateClass before_state, after_state
-    cdef StateC* c_state_before
-    cdef StateC* c_state_after
-
-    assert len(before_states) == len(after_states)
-
-    # Check invariant: before states histories must be prefixes of after states.
-    for before_state, after_state in zip(before_states, after_states):
-        c_state_before = before_state.c
-        c_state_after = after_state.c
-
-        assert equal(c_state_before.history.begin(),
-                     c_state_before.history.end(),
-                     c_state_after.history.begin())
-
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for before_state, after_state in zip(before_states, after_states):
-            c_state_before = before_state.c
-            c_state_after = after_state.c
-            if step < c_state_after.history.size() - c_state_before.history.size():
-                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
deleted file mode 100644
index 8c1cf7a9346..00000000000
--- a/spacy/tests/parser/test_model.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import pytest
-
-from spacy.lang.en import English
-from spacy.ml.tb_framework import TransitionModelInputs
-from spacy.training import Example
-
-TRAIN_DATA = [
-    (
-        "They trade mortgage-backed securities.",
-        {
-            "heads": [1, 1, 4, 4, 5, 1, 1],
-            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
-        },
-    ),
-    (
-        "I like London and Berlin.",
-        {
-            "heads": [1, 1, 1, 2, 2, 1],
-            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
-        },
-    ),
-]
-
-
-@pytest.fixture
-def nlp_parser():
-    nlp = English()
-    parser = nlp.add_pipe("parser")
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for dep in annotations["deps"]:
-            parser.add_label(dep)
-    nlp.initialize()
-
-    return nlp, parser
-
-
-def test_incorrect_number_of_actions(nlp_parser):
-    nlp, parser = nlp_parser
-    doc = nlp.make_doc("test")
-
-    # Too many actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
-            )
-        )
-
-    # Too few actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc, doc],
-                moves=parser.moves,
-                actions=[numpy.array([0], dtype="i")],
-            )
-        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index f0efc3a63fd..bb9b7653ce3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -623,9 +623,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,7 +641,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )

From edadf9e3299cf3e819ed6d70a42e49687f9de666 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:23:08 +0100
Subject: [PATCH 463/504] Revert "Merge the parser refactor into `v4` (#10940)"

This reverts commit a183db3cefe0713e828868b43daf61c464cae05c.
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/compat.py                               |   5 +
 spacy/ml/_precomputable_affine.py             | 164 +++++
 spacy/ml/tb_framework.pxd                     |  28 -
 spacy/ml/tb_framework.py                      |  50 ++
 spacy/ml/tb_framework.pyx                     | 639 ------------------
 .../_parser_internals/_parser_utils.pxd       |   2 -
 .../_parser_internals/_parser_utils.pyx       |  22 -
 spacy/pipeline/_parser_internals/_state.pxd   |  60 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 -
 spacy/pipeline/_parser_internals/batch.pxd    |   2 -
 spacy/pipeline/_parser_internals/batch.pyx    |  52 --
 spacy/pipeline/_parser_internals/ner.pyx      |   6 +-
 .../pipeline/_parser_internals/stateclass.pyx |   7 -
 .../_parser_internals/transition_system.pxd   |   7 -
 .../_parser_internals/transition_system.pyx   |  69 --
 .../{dep_parser.py => dep_parser.pyx}         |  20 +-
 spacy/pipeline/{ner.py => ner.pyx}            |  35 +-
 spacy/pipeline/transition_parser.pyx          | 500 ++++++++------
 spacy/tests/parser/test_ner.py                |  10 +-
 spacy/tests/parser/test_parse.py              |  74 +-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  55 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/legacy.mdx                   |   2 +-
 27 files changed, 683 insertions(+), 1231 deletions(-)
 create mode 100644 spacy/ml/_precomputable_affine.py
 delete mode 100644 spacy/ml/tb_framework.pxd
 create mode 100644 spacy/ml/tb_framework.py
 delete mode 100644 spacy/ml/tb_framework.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.py => dep_parser.pyx} (96%)
 rename spacy/pipeline/{ner.py => ner.pyx} (93%)

diff --git a/setup.py b/setup.py
index a4a87d68aec..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,12 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.tb_framework",
+    "spacy.ml.parser_model",
     "spacy.morphology",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -43,7 +45,6 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
-    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -51,7 +52,6 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
-    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 36325711d4d..2817147f3e9 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,11 +90,12 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -110,11 +111,12 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -385,11 +387,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -402,11 +405,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/compat.py b/spacy/compat.py
index 1e63807a0e8..30459e2e495 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,6 +23,11 @@
 except ImportError:
     cupy = None
 
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
new file mode 100644
index 00000000000..1c20c622b2c
--- /dev/null
+++ b/spacy/ml/_precomputable_affine.py
@@ -0,0 +1,164 @@
+from thinc.api import Model, normal_init
+
+from ..util import registry
+
+
+@registry.layers("spacy.PrecomputableAffine.v1")
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
+    model = Model(
+        "precomputable_affine",
+        forward,
+        init=init,
+        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+        params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout},
+    )
+    return model
+
+
+def forward(model, X, is_train):
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.get_param("W")
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
+    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
+
+    def backward(dY_ids):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nO, nP), and get back:
+        # (nB, nO, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nO, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        assert dY.ndim == 3
+        assert dY.shape[1] == nO, dY.shape
+        assert dY.shape[2] == nP, dY.shape
+        # nB = dY.shape[0]
+        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+        Xf = X[ids]
+        Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+        model.inc_grad("b", dY.sum(axis=0))
+        dY = dY.reshape((dY.shape[0], nO * nP))
+
+        Wopfi = W.transpose((1, 2, 0, 3))
+        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
+        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+        # (o, p, f, i) --> (f, o, p, i)
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
+        model.inc_grad("W", dWopfi)
+        return dXf.reshape((dXf.shape[0], nF, nI))
+
+    return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+    nB = dY.shape[0]
+    nF = model.get_dim("nF")
+    nP = model.get_dim("nP")
+    nO = model.get_dim("nO")
+    # Backprop the "padding", used as a filler for missing values.
+    # Values that are missing are set to -1, and each state vector could
+    # have multiple missing values. The padding has different values for
+    # different missing features. The gradient of the padding vector is:
+    #
+    # for b in range(nB):
+    #     for f in range(nF):
+    #         if ids[b, f] < 0:
+    #             d_pad[f] += dY[b]
+    #
+    # Which can be rewritten as:
+    #
+    # (ids < 0).T @ dY
+    mask = model.ops.asarray(ids < 0, dtype="f")
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
+    return d_pad.reshape((1, nF, nO, nP))
+
+
+def init(model, X=None, Y=None):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    if model.has_param("W") and model.get_param("W").any():
+        return
+
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nO, nP, nI)
+    b = model.ops.alloc2f(nO, nP)
+    pad = model.ops.alloc4f(1, nF, nO, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
+        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors = vectors.reshape((vectors.shape[0], nO, nP))
+        vectors += b
+        vectors = model.ops.asarray(vectors)
+        if nP >= 2:
+            return model.ops.maxout(vectors)[0]
+        else:
+            return vectors * (vectors >= 0)
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = model.get_param("W").copy()
+    b = model.get_param("b").copy()
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("b", b)
+        else:
+            break
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
deleted file mode 100644
index 965508519e8..00000000000
--- a/spacy/ml/tb_framework.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from libc.stdint cimport int8_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-    int tokens
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const int8_t* seen_mask
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 00000000000..ab4a969e24e
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,50 @@
+from thinc.api import Model, noop
+from .parser_model import ParserStepModel
+from ..util import registry
+
+
+@registry.layers("spacy.TransitionModel.v1")
+def TransitionModel(
+    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
+):
+    """Set up a stepwise transition-based model"""
+    if upper is None:
+        has_upper = False
+        upper = noop()
+    else:
+        has_upper = True
+    # don't define nO for this object, because we can't dynamically change it
+    return Model(
+        name="parser_model",
+        forward=forward,
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
+        layers=[tok2vec, lower, upper],
+        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+        init=init,
+        attrs={
+            "has_upper": has_upper,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def forward(model, X, is_train):
+    step_model = ParserStepModel(
+        X,
+        model.layers,
+        unseen_classes=model.attrs["unseen_classes"],
+        train=is_train,
+        has_upper=model.attrs["has_upper"],
+    )
+
+    return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+    model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower")
+    lower.initialize()
+    if model.attrs["has_upper"]:
+        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
deleted file mode 100644
index b81553323e4..00000000000
--- a/spacy/ml/tb_framework.pyx
+++ /dev/null
@@ -1,639 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, cast
-
-from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
-import numpy
-
-cimport numpy as np
-
-from thinc.api import (
-    Linear,
-    Model,
-    NumpyOps,
-    chain,
-    glorot_uniform_init,
-    list2array,
-    normal_init,
-    uniform_init,
-    zero_init,
-)
-
-from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-
-from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
-
-from ..errors import Errors
-from ..pipeline._parser_internals import _beam_utils
-from ..pipeline._parser_internals.batch import GreedyBatch
-
-from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
-from ..pipeline._parser_internals.transition_system cimport (
-    TransitionSystem,
-    c_apply_actions,
-    c_transition_batch,
-)
-
-from ..tokens.doc import Doc
-from ..util import registry
-
-State = Any  # TODO
-
-
-@registry.layers("spacy.TransitionModel.v2")
-def TransitionModel(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    beam_width: int = 1,
-    beam_density: float = 0.0,
-    state_tokens: int,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-    unseen_classes=set(),
-) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
-    """Set up a transition-based parsing model, using a maxout hidden
-    layer and a linear output layer.
-    """
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
-    tok2vec_projected.set_dim("nO", hidden_width)
-
-    # FIXME: we use `output` as a container for the output layer's
-    # weights and biases. Thinc optimizers cannot handle resizing
-    # of parameters. So, when the parser model is resized, we
-    # construct a new `output` layer, which has a different key in
-    # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `output` layer by `output_W` and `output_b`
-    # parameters in this model.
-    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
-
-    return Model(
-        name="parser_model",
-        forward=forward,
-        init=init,
-        layers=[tok2vec_projected, output],
-        refs={
-            "tok2vec": tok2vec_projected,
-            "output": output,
-        },
-        params={
-            "hidden_W": None,  # Floats2d W for the hidden layer
-            "hidden_b": None,  # Floats1d bias for the hidden layer
-            "hidden_pad": None,  # Floats1d padding for the hidden layer
-        },
-        dims={
-            "nO": None,  # Output size
-            "nP": maxout_pieces,
-            "nH": hidden_width,
-            "nI": tok2vec_projected.maybe_get_dim("nO"),
-            "nF": state_tokens,
-        },
-        attrs={
-            "beam_width": beam_width,
-            "beam_density": beam_density,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def resize_output(model: Model, new_nO: int) -> Model:
-    old_nO = model.maybe_get_dim("nO")
-    output = model.get_ref("output")
-    if old_nO is None:
-        model.set_dim("nO", new_nO)
-        output.set_dim("nO", new_nO)
-        output.initialize()
-        return model
-    elif new_nO <= old_nO:
-        return model
-    elif output.has_param("W"):
-        nH = model.get_dim("nH")
-        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_output.initialize()
-        new_W = new_output.get_param("W")
-        new_b = new_output.get_param("b")
-        old_W = output.get_param("W")
-        old_b = output.get_param("b")
-        new_W[:old_nO] = old_W  # type: ignore
-        new_b[:old_nO] = old_b  # type: ignore
-        for i in range(old_nO, new_nO):
-            model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_output
-        model.set_ref("output", new_output)
-    # TODO: Avoid this private intrusion
-    model._dims["nO"] = new_nO
-    return model
-
-
-def init(
-    model,
-    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
-    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
-):
-    if X is not None:
-        docs, _ = X
-        model.get_ref("tok2vec").initialize(X=docs)
-    else:
-        model.get_ref("tok2vec").initialize()
-    inferred_nO = _infer_nO(Y)
-    if inferred_nO is not None:
-        current_nO = model.maybe_get_dim("nO")
-        if current_nO is None or current_nO != inferred_nO:
-            model.attrs["resize_output"](model, inferred_nO)
-    # nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nH = model.get_dim("nH")
-    nI = model.get_dim("nI")
-    nF = model.get_dim("nF")
-    ops = model.ops
-
-    Wl = ops.alloc2f(nH * nP, nF * nI)
-    bl = ops.alloc1f(nH * nP)
-    padl = ops.alloc1f(nI)
-    # Wl = zero_init(ops, Wl.shape)
-    Wl = glorot_uniform_init(ops, Wl.shape)
-    padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize output_W
-    model.set_param("hidden_W", Wl)
-    model.set_param("hidden_b", bl)
-    model.set_param("hidden_pad", padl)
-    # model = _lsuv_init(model)
-    return model
-
-
-class TransitionModelInputs:
-    """
-    Input to transition model.
-    """
-
-    # dataclass annotation is not yet supported in Cython 0.29.x,
-    # so, we'll do something close to it.
-
-    actions: Optional[List[Ints1d]]
-    docs: List[Doc]
-    max_moves: int
-    moves: TransitionSystem
-    states: Optional[List[State]]
-
-    __slots__ = [
-        "actions",
-        "docs",
-        "max_moves",
-        "moves",
-        "states",
-    ]
-
-    def __init__(
-        self,
-        docs: List[Doc],
-        moves: TransitionSystem,
-        actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0,
-        states: Optional[List[State]] = None,
-    ):
-        """
-        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
-        docs (List[Doc]): Docs to predict transition sequences for.
-        max_moves: (int): the maximum number of moves to apply, values less
-            than 1 will apply moves to states until they are final states.
-        moves (TransitionSystem): the transition system to use when predicting
-            the transition sequences.
-        states (Optional[List[States]]): the initial states to predict the
-            transition sequences for. When absent, the initial states are
-            initialized from the provided Docs.
-        """
-        self.actions = actions
-        self.docs = docs
-        self.moves = moves
-        self.max_moves = max_moves
-        self.states = states
-
-
-def forward(model, inputs: TransitionModelInputs, is_train: bool):
-    docs = inputs.docs
-    moves = inputs.moves
-    actions = inputs.actions
-
-    beam_width = model.attrs["beam_width"]
-    hidden_pad = model.get_param("hidden_pad")
-    tok2vec = model.get_ref("tok2vec")
-
-    states = moves.init_batch(docs) if inputs.states is None else inputs.states
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    seen_mask = _get_seen_mask(model)
-
-    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
-        # Note: max_moves is only used during training, so we don't need to
-        #       pass it to the greedy inference path.
-        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
-    else:
-        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
-                                 max_moves=inputs.max_moves)
-
-
-def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
-    cdef vector[StateC*] c_states
-    cdef StateClass state
-    for state in states:
-        if not state.is_final():
-            c_states.push_back(state.c)
-    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
-    # Precomputed features have rows for each token, plus one for padding.
-    cdef int n_tokens = feats.shape[0] - 1
-    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
-    cdef CBlas cblas = model.ops.cblas()
-    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
-
-    def backprop(dY):
-        raise ValueError(Errors.E4004)
-
-    return (states, scores), backprop
-
-
-cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
-                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i
-    cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = _alloc_activations(sizes)
-    cdef np.ndarray step_scores
-    cdef np.ndarray step_actions
-
-    scores = []
-    while sizes.states >= 1:
-        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
-        step_actions = actions[0] if actions is not None else None
-        with nogil:
-            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
-            if actions is None:
-                # Validate actions, argmax, take action.
-                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                                   sizes.states)
-            else:
-                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-        sizes.states = unfinished.size()
-        scores.append(step_scores)
-        unfinished.clear()
-        actions = actions[1:] if actions is not None else None
-    _free_activations(&activations)
-
-    return scores
-
-
-def _forward_fallback(
-    model: Model,
-    moves: TransitionSystem,
-    states: List[StateClass],
-    tokvecs, backprop_tok2vec,
-    feats,
-    backprop_feats,
-    seen_mask,
-    is_train: bool,
-    actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0):
-    nF = model.get_dim("nF")
-    output = model.get_ref("output")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-
-    beam_width = model.attrs["beam_width"]
-    beam_density = model.attrs["beam_density"]
-
-    ops = model.ops
-
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    if beam_width == 1:
-        batch = GreedyBatch(moves, states, None)
-    else:
-        batch = _beam_utils.BeamBatch(
-            moves, states, None, width=beam_width, density=beam_density
-        )
-    arange = ops.xp.arange(nF)
-    n_moves = 0
-    while not batch.is_done:
-        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
-        for i, state in enumerate(batch.get_unfinished_states()):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += hidden_b
-        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        cpu_scores = ops.to_numpy(scores)
-        if actions is None:
-            batch.advance(cpu_scores)
-        else:
-            batch.advance_with_actions(actions[0])
-            actions = actions[1:]
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_ids.append(ids)
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-        if n_moves >= max_moves >= 1:
-            break
-        n_moves += 1
-
-    def backprop_parser(d_states_d_scores):
-        ids = ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = ops.xp.vstack(all_statevecs)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False  # no-cython-lint
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO) @ (nS, nH).T
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        # We don't need to backprop the summation, because we pass back the IDs instead
-        d_state_features = backprop_feats((d_preacts2f, ids))
-        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (list(batch), all_scores), backprop_parser
-
-
-def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
-    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
-    for class_ in model.attrs.get("unseen_classes", set()):
-        mask[class_] = True
-    return mask
-
-
-def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("hidden_W")
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
-    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
-    W3f = W3f.transpose((1, 0, 2))
-    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
-    assert X.shape == (X.shape[0], nI), X.shape
-    Yf_ = model.ops.gemm(X, W2f, trans2=True)
-    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
-
-    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nH, nP), and get back:
-        # (nB, nH, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nH, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        dXf = model.ops.gemm(dY, W)
-        Xf = X[ids].reshape((ids.shape[0], -1))
-        dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("hidden_W", dW)
-        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
-
-    return Yf, backward
-
-
-def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
-    if Y is None:
-        return None
-    _, scores = Y
-    if len(scores) == 0:
-        return None
-    assert scores[0].shape[0] >= 1
-    assert len(scores[0].shape) == 2
-    return scores[0].shape[1]
-
-
-def _lsuv_init(model: Model):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    W = model.maybe_get_param("hidden_W")
-    if W is not None and W.any():
-        return
-
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nH, nP, nI)
-    b = model.ops.alloc2f(nH, nP)
-    pad = model.ops.alloc4f(1, nF, nH, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc_f((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc_f((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
-        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
-        vectors3f += b
-        return model.ops.maxout(vectors3f)[0]
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = cast(Floats4d, model.get_param("hidden_W").copy())
-    b = cast(Floats2d, model.get_param("hidden_b").copy())
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("hidden_W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("hidden_b", b)
-        else:
-            break
-    return model
-
-
-cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    output = model.get_ref("output")
-    cdef np.ndarray hidden_b = model.get_param("hidden_b")
-    cdef np.ndarray output_W = output.get_param("W")
-    cdef np.ndarray output_b = output.get_param("b")
-
-    cdef WeightsC weights
-    weights.feat_weights = feats
-    weights.feat_bias = <const float*>hidden_b.data
-    weights.hidden_weights = <const float *> output_W.data
-    weights.hidden_bias = <const float *> output_b.data
-    weights.seen_mask = <const int8_t*> seen_mask.data
-
-    return weights
-
-
-cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC sizes
-    sizes.states = batch_size
-    sizes.classes = model.get_dim("nO")
-    sizes.hiddens = model.get_dim("nH")
-    sizes.pieces = model.get_dim("nP")
-    sizes.feats = model.get_dim("nF")
-    sizes.embed_width = model.get_dim("nI")
-    sizes.tokens = tokens
-    return sizes
-
-
-cdef ActivationsC _alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    _resize_activations(&A, n)
-    return A
-
-
-cdef void _free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    _resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    if W.hidden_weights == NULL:
-        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = scores[0]
-    for i in range(1, n.states * n.classes):
-        if scores[i] < min_:
-            min_ = scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if W.seen_mask[j]:
-                scores[i*n.classes+j] = min_
-
-
-cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces  # no-cython-lint
-    cdef int F = n.feats
-    cdef int T = n.tokens
-    padding = cached + (T * F * O)
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
deleted file mode 100644
index 7fee05bad60..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef int arg_max(const float* scores, const int n_classes) nogil
-cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
deleted file mode 100644
index 582756bf5be..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pyx
+++ /dev/null
@@ -1,22 +0,0 @@
-# cython: infer_types=True
-
-cdef inline int arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
-
-
-cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1c61ac271d8..04274ce8af1 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -7,6 +7,8 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -26,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    vector[int] _heads
+    int* _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,34 +36,31 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
-    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil except +:
-        this._heads.resize(length, -1)
-        this._unshiftable.resize(length, False)
-
-        # Reserve memory ahead of time to minimize allocations during parsing.
-        # The initial capacity set here ideally reflects the expected average-case/majority usage.
-        cdef int init_capacity = 32
-        this._stack.reserve(init_capacity)
-        this._rebuffer.reserve(init_capacity)
-        this._ents.reserve(init_capacity)
-        this._left_arcs.reserve(init_capacity)
-        this._right_arcs.reserve(init_capacity)
-        this.history.reserve(init_capacity)
-
+    __init__(const TokenC* sent, int length) nogil:
         this._sent = sent
+        this._heads = <int*>calloc(length, sizeof(int))
+        if not (this._sent and this._heads):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
+        for i in range(length):
+            this._heads[i] = -1
+            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
+    __dealloc__():
+        free(this._heads)
+
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -134,20 +133,19 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        cdef int stack_size = this._stack.size()
-        if i >= stack_size or i < 0:
+        if i >= this._stack.size():
             return -1
-        else:
-            return this._stack[stack_size - (i+1)]
+        elif i < 0:
+            return -1
+        return this._stack.at(this._stack.size() - (i+1))
 
     int B(int i) nogil const:
-        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < buf_size:
-            return this._rebuffer[buf_size - (i+1)]
+        elif i < this._rebuffer.size():
+            return this._rebuffer.at(this._rebuffer.size() - (i+1))
         else:
-            b_i = this._b_i + (i - buf_size)
+            b_i = this._b_i + (i - this._rebuffer.size())
             if b_i >= this.length:
                 return -1
             else:
@@ -246,7 +244,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
+        elif this._sent_starts.count(word) >= 1:
             return 1
         else:
             return 0
@@ -330,7 +328,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable[item]
+            return this._unshiftable.at(item)
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -350,9 +348,6 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        cdef ArcC* arc
-
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -361,12 +356,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = &arcs.back()
+        arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = &deref(arcs)[i]
+                arc = arcs.at(i)
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -406,11 +401,10 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        this._heads = src._heads
+        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
-        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 462aa820e4f..9dda3bd5e44 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,8 +779,6 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -865,7 +863,6 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
deleted file mode 100644
index 60734e549aa..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef class Batch:
-    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
deleted file mode 100644
index 91073b52e68..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any
-
-TransitionSystem = Any  # TODO
-
-cdef class Batch:
-    def advance(self, scores):
-        raise NotImplementedError
-
-    def get_states(self):
-        raise NotImplementedError
-
-    @property
-    def is_done(self):
-        raise NotImplementedError
-
-    def get_unfinished_states(self):
-        raise NotImplementedError
-
-    def __getitem__(self, i):
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-
-class GreedyBatch(Batch):
-    def __init__(self, moves: TransitionSystem, states, golds):
-        self._moves = moves
-        self._states = states
-        self._next_states = [s for s in states if not s.is_final()]
-
-    def advance(self, scores):
-        self._next_states = self._moves.transition_states(self._next_states, scores)
-
-    def advance_with_actions(self, actions):
-        self._next_states = self._moves.apply_actions(self._next_states, actions)
-
-    def get_states(self):
-        return self._states
-
-    @property
-    def is_done(self):
-        return all(s.is_final() for s in self._states)
-
-    def get_unfinished_states(self):
-        return [st for st in self._states if not st.is_final()]
-
-    def __getitem__(self, i):
-        return self._states[i]
-
-    def __len__(self):
-        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index bd4e06dedb3..0a79e77cb86 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -166,7 +166,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+    
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -316,8 +316,6 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -653,7 +651,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
-
+ 
 
 
 cdef class Out:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index c2a0d22956a..f25408a13ba 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,10 +21,6 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
-    @property
-    def history(self):
-        return list(self.c.history)
-
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -181,6 +177,3 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
-
-    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
-        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 08baed932ba..04cd10d8864 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,10 +57,3 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index ae1cf890f3e..4a0feb435dd 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,8 +3,6 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -76,18 +74,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def follow_history(self, doc, history):
-        cdef int clas
-        cdef StateClass state = StateClass(doc)
-        for clas in history:
-            action = self.c[clas]
-            action.do(state.c, action.label)
-            state.c.history.push_back(clas)
-        return state
-
     def get_oracle_sequence(self, Example example, _debug=False):
-        if not self.has_gold(example):
-            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -99,8 +86,6 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
-        if state.is_final():
-            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -126,7 +111,6 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -154,28 +138,6 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
-        state.c.history.push_back(action.clas)
-
-    def apply_actions(self, states, const int[::1] actions):
-        assert len(states) == actions.shape[0]
-        cdef StateClass state
-        cdef vector[StateC*] c_states
-        c_states.resize(len(states))
-        cdef int i
-        for (i, state) in enumerate(states):
-            c_states[i] = state.c
-        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    def transition_states(self, states, float[:, ::1] scores):
-        assert len(states) == scores.shape[0]
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -288,34 +250,3 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil:
-    cdef int i
-    cdef Transition action
-    cdef StateC* state
-    for i in range(batch_size):
-        state = states[i]
-        action = moves.c[actions[i]]
-        action.do(state, action.label)
-        state.history.push_back(action.clas)
-
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil:
-    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
-    cdef int i, guess
-    cdef Transition action
-    for i in range(batch_size):
-        moves.set_valid(is_valid, states[i])
-        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-        if guess == -1:
-            # This shouldn't happen, but it's hard to raise an error here,
-            # and we don't want to infinite loop. So, force to end state.
-            states[i].force_final()
-        else:
-            action = moves.c[guess]
-            action.do(states[i], action.label)
-            states[i].history.push_back(guess)
-    free(is_valid)
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.pyx
similarity index 96%
rename from spacy/pipeline/dep_parser.py
rename to spacy/pipeline/dep_parser.pyx
index b4961487b83..3e59deaae4f 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.pyx
@@ -4,6 +4,10 @@
 
 from thinc.api import Config, Model
 
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager
+
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
@@ -17,11 +21,12 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -121,7 +126,6 @@ def make_parser(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,7 +231,6 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
-
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,11 +238,8 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
-
     results = {}
-    results.update(
-        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-    )
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -252,12 +252,11 @@ def make_parser_scorer():
     return parser_score
 
 
-class DependencyParser(Parser):
+cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
-
     TransitionSystem = ArcEager
 
     def __init__(
@@ -277,7 +276,8 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser."""
+        """Create a DependencyParser.
+        """
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.pyx
similarity index 93%
rename from spacy/pipeline/ner.py
rename to spacy/pipeline/ner.pyx
index 1c7cf151385..c73ec5c528a 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.pyx
@@ -10,15 +10,22 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..util import registry
+from ..training import remove_bilu_prefix
+
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -43,12 +50,8 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+
 )
 def make_ner(
     nlp: Language,
@@ -101,7 +104,6 @@ def make_ner(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -115,12 +117,7 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
 def make_beam_ner(
     nlp: Language,
@@ -194,12 +191,11 @@ def make_ner_scorer():
     return ner_score
 
 
-class EntityRecognizer(Parser):
+cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -217,14 +213,15 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer."""
+        """Create an EntityRecognizer.
+        """
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,  # not relevant for NER
+            min_action_freq=1,   # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 68add292459..329a5b0ca85 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,16 +1,21 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
-
 from typing import Dict, Iterable, List, Optional, Tuple
-
-cimport numpy as np
 from cymem.cymem cimport Pool
-
-import contextlib
-import random
+cimport numpy as np
 from itertools import islice
+from libcpp.vector cimport vector
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free
+import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d
+import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -24,7 +29,16 @@ from thinc.api import (
 )
 from thinc.types import Floats2d, Ints1d
 
-from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..tokens.doc cimport Doc
+from .trainable_pipe import TrainablePipe
+from ._parser_internals cimport _beam_utils
+from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
 from ..typedefs cimport weight_t
@@ -74,7 +88,7 @@ cdef extern from "<algorithm>" namespace "std" nogil:
 NUMPY_OPS = NumpyOps()
 
 
-class Parser(TrainablePipe):
+cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -175,9 +189,8 @@ class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
-        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -284,6 +297,15 @@ class Parser(TrainablePipe):
 
         student_docs = [eg.predicted for eg in examples]
 
+        teacher_step_model = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_step_model, backprop_tok2vec = self.model.begin_update(student_docs)
+
+        # Add softmax activation, so that we can compute student losses
+        # with cross-entropy loss.
+        with use_ops("numpy"):
+            teacher_model = chain(teacher_step_model, softmax_activation())
+            student_model = chain(student_step_model, softmax_activation())
+        
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -291,38 +313,50 @@ class Parser(TrainablePipe):
             # sequence, we use the teacher's predictions as the gold
             # standard.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
-        # We distill as follows: 1. we first let the student predict transition
-        # sequences (and the corresponding transition probabilities); (2) we
-        # let the teacher follow the student's predicted transition sequences
-        # to obtain the teacher's transition probabilities; (3) we compute the
-        # gradients of the student's transition distributions relative to the
-        # teacher's distributions.
-
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
-        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
+        loss = 0.0
+        n_moves = 0
+        while states:
+            # We do distillation as follows: (1) for every state, we compute the
+            # transition softmax distributions: (2) we backpropagate the error of
+            # the student (compared to the teacher) into the student model; (3)
+            # for all states, we move to the next state using the student's
+            # predictions.
+            teacher_scores = teacher_model.predict(states)
+            student_scores, backprop = student_model.begin_update(states)
+            state_loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+            backprop(d_scores)
+            loss += state_loss
+            self.transition_states(states, student_scores)
+            states = [state for state in states if not state.is_final()]
+
+            # Stop when we reach the maximum number of moves, otherwise we start
+            # to process the remainder of cut sequences again.
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
 
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
-        backprop_scores((student_states, d_scores))
+        backprop_tok2vec(student_docs)
 
         if sgd is not None:
             self.finish_update(sgd)
 
         losses[self.name] += loss
 
+        del backprop
+        del backprop_tok2vec
+        teacher_step_model.clear_memory()
+        student_step_model.clear_memory()
+        del teacher_model
+        del student_model
+
         return losses
 
     def get_teacher_student_loss(
-            self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
-            normalize: bool = False,
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
     ) -> Tuple[float, List[Floats2d]]:
         """Calculate the loss and its gradient for a batch of student
         scores, relative to teacher scores.
@@ -334,28 +368,10 @@ class Parser(TrainablePipe):
 
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-
-        # We can't easily hook up a softmax layer in the parsing model, since
-        # the get_loss does additional masking. So, we could apply softmax
-        # manually here and use Thinc's cross-entropy loss. But it's a bit
-        # suboptimal, since we can have a lot of states that would result in
-        # many kernel launches. Futhermore the parsing model's backprop expects
-        # a XP array, so we'd have to concat the softmaxes anyway. So, like
-        # the get_loss implementation, we'll compute the loss and gradients
-        # ourselves.
-
-        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
-                                                axis=-1, inplace=True)
-        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
-                                                axis=-1, inplace=True)
-
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = student_scores - teacher_scores
-        if normalize:
-            d_scores /= d_scores.shape[0]
-        loss = (d_scores**2).sum() / d_scores.size
-
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
@@ -378,6 +394,9 @@ class Parser(TrainablePipe):
 
         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
+        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
+            deals with a failing batch of documents. The default function just reraises
+            the exception.
 
         YIELDS (Doc): Documents, in order.
         """
@@ -398,31 +417,78 @@ class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
-        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states_or_beams, _ = self.model.predict(inputs)
-        return states_or_beams
+        if self.cfg["beam_width"] == 1:
+            return self.greedy_parse(docs, drop=0.0)
+        else:
+            return self.beam_parse(
+                docs,
+                drop=0.0,
+                beam_width=self.cfg["beam_width"],
+                beam_density=self.cfg["beam_density"]
+            )
 
     def greedy_parse(self, docs, drop=0.):
         cdef vector[StateC*] states
         cdef StateClass state
         cdef CBlas cblas = self._cpu_ops.cblas()
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=1):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            states, _ = self.model.predict(inputs)
-        return states
+        set_dropout_rate(self.model, drop)
+        batch = self.moves.init_batch(docs)
+        model = self.model.predict(docs)
+        weights = get_c_weights(model)
+        for state in batch:
+            if not state.is_final():
+                states.push_back(state.c)
+        sizes = get_c_sizes(model, states.size())
+        with nogil:
+            self._parseC(cblas, &states[0], weights, sizes)
+        model.clear_memory()
+        del model
+        return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
+        cdef Beam beam
+        cdef Doc doc
         self._ensure_labels_are_added(docs)
-        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
-            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-            beams, _ = self.model.predict(inputs)
-        return beams
+        batch = _beam_utils.BeamBatch(
+            self.moves,
+            self.moves.init_batch(docs),
+            None,
+            beam_width,
+            density=beam_density
+        )
+        model = self.model.predict(docs)
+        while not batch.is_done:
+            states = batch.get_unfinished_states()
+            if not states:
+                break
+            scores = model.predict(states)
+            batch.advance(scores)
+        model.clear_memory()
+        del model
+        return list(batch)
+
+    cdef void _parseC(self, CBlas cblas, StateC** states,
+            WeightsC weights, SizesC sizes) nogil:
+        cdef int i, j
+        cdef vector[StateC*] unfinished
+        cdef ActivationsC activations = alloc_activations(sizes)
+        while sizes.states >= 1:
+            predict_states(cblas, &activations, states, &weights, sizes)
+            # Validate actions, argmax, take action.
+            self.c_transition_batch(states,
+                activations.scores, sizes.classes, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+            sizes.states = unfinished.size()
+            unfinished.clear()
+        free_activations(&activations)
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
@@ -433,6 +499,35 @@ class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)
 
+    def transition_states(self, states, float[:, ::1] scores):
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil:
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        with gil:
+            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
+        cdef int i, guess
+        cdef Transition action
+        for i in range(batch_size):
+            self.moves.set_valid(is_valid, states[i])
+            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+            if guess == -1:
+                # This shouldn't happen, but it's hard to raise an error here,
+                # and we don't want to infinite loop. So, force to end state.
+                states[i].force_final()
+            else:
+                action = self.moves.c[guess]
+                action.do(states[i], action.label)
+        free(is_valid)
+
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         if losses is None:
             losses = {}
@@ -443,99 +538,67 @@ class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
-        # We need to take care to act on the whole batch, because we might be
-        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        docs = [eg.x for eg in examples if len(eg.x)]
-
+        # The probability we use beam update, instead of falling back to
+        # a greedy update
+        beam_update_prob = self.cfg["beam_update_prob"]
+        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
+            return self.update_beam(
+                examples,
+                beam_width=self.cfg["beam_width"],
+                sgd=sgd,
+                losses=losses,
+                beam_density=self.cfg["beam_density"]
+            )
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
-            init_states, gold_states, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states, golds, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
-
-        inputs = TransitionModelInputs(docs=docs,
-                                       moves=self.moves,
-                                       max_moves=max_moves,
-                                       states=[state.copy() for state in init_states])
-        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
-        if sum(s.shape[0] for s in scores) == 0:
+            states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
             return losses
-        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
-                                 examples, max_moves)
-        backprop_scores((pred_states, d_scores))
+        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
+ 
+        all_states = list(states)
+        states_golds = list(zip(states, golds))
+        n_moves = 0
+        while states_golds:
+            states, golds = zip(*states_golds)
+            scores, backprop = model.begin_update(states)
+            d_scores = self.get_batch_loss(states, golds, scores, losses)
+            # Note that the gradient isn't normalized by the batch size
+            # here, because our "samples" are really the states...But we
+            # can't normalize by the number of states either, as then we'd
+            # be getting smaller gradients for states in long sequences.
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, scores)
+            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+            if max_moves >= 1 and n_moves >= max_moves:
+                break
+            n_moves += 1
+
+        backprop_tok2vec(golds)
         if sgd not in (None, False):
             self.finish_update(sgd)
-        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
 
-    def get_loss(self, states_scores, examples, max_moves):
-        gold_states, init_states, pred_states, scores = states_scores
-        scores = self.model.ops.xp.vstack(scores)
-        costs = self._get_costs_from_histories(
-            examples,
-            gold_states,
-            init_states,
-            [list(state.history) for state in pred_states],
-            max_moves
-        )
-        xp = get_array_module(scores)
-        best_costs = costs.min(axis=1, keepdims=True)
-        gscores = scores.copy()
-        min_score = scores.min() - 1000
-        assert costs.shape == scores.shape, (costs.shape, scores.shape)
-        gscores[costs > best_costs] = min_score
-        max_ = scores.max(axis=1, keepdims=True)
-        gmax = gscores.max(axis=1, keepdims=True)
-        exp_scores = xp.exp(scores - max_)
-        exp_gscores = xp.exp(gscores - gmax)
-        Z = exp_scores.sum(axis=1, keepdims=True)
-        gZ = exp_gscores.sum(axis=1, keepdims=True)
-        d_scores = exp_scores / Z
-        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
-        return d_scores
-
-    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
-        cdef TransitionSystem moves = self.moves
-        cdef StateClass state
-        cdef int clas
-        cdef int nO = moves.n_moves
-        cdef Pool mem = Pool()
-        cdef np.ndarray costs_i
-        is_valid = <int*>mem.alloc(nO, sizeof(int))
-        batch = list(zip(init_states, histories, gold_states))
-        n_moves = 0
-        output = []
-        while batch:
-            costs = numpy.zeros((len(batch), nO), dtype="f")
-            for i, (state, history, gold) in enumerate(batch):
-                costs_i = costs[i]
-                clas = history.pop(0)
-                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
-                action = moves.c[clas]
-                action.do(state.c, action.label)
-                state.c.history.push_back(clas)
-            output.append(costs)
-            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
-            if n_moves >= max_moves >= 1:
-                break
-            n_moves += 1
-
-        return self.model.ops.xp.vstack(output)
-
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
@@ -545,9 +608,10 @@ class Parser(TrainablePipe):
                 multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.0)
+        losses.setdefault(self.name, 0.)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
+        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -555,33 +619,85 @@ class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
-        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
-        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
-        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
-
-        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
-
-        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
-        student_scores = self.model.ops.xp.vstack(student_scores)
-        assert teacher_scores.shape == student_scores.shape
-
-        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
-        # If all weights for an output are 0 in the original model, don't
-        # supervise that output. This allows us to add classes.
-        loss = (d_scores**2).sum() / d_scores.size
-        backprop_scores((student_states, d_scores))
-
+        tutor, _ = self._rehearsal_model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
+        n_scores = 0.
+        loss = 0.
+        while states:
+            targets, _ = tutor.begin_update(states)
+            guesses, backprop = model.begin_update(states)
+            d_scores = (guesses - targets) / targets.shape[0]
+            # If all weights for an output are 0 in the original model, don't
+            # supervise that output. This allows us to add classes.
+            loss += (d_scores**2).sum()
+            backprop(d_scores)
+            # Follow the predicted action
+            self.transition_states(states, guesses)
+            states = [state for state in states if not state.is_final()]
+            n_scores += d_scores.size
+        # Do the backprop
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += loss
-
+        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width, drop=0.,
-                    sgd=None, losses=None, beam_density=0.0):
-        raise NotImplementedError
+    def update_beam(self, examples, *, beam_width,
+            drop=0., sgd=None, losses=None, beam_density=0.0):
+        states, golds, _ = self.moves.init_gold_batch(examples)
+        if not states:
+            return losses
+        # Prepare the stepwise model, and get the callback for finishing the batch
+        model, backprop_tok2vec = self.model.begin_update(
+            [eg.predicted for eg in examples])
+        loss = _beam_utils.update_beam(
+            self.moves,
+            states,
+            golds,
+            model,
+            beam_width,
+            beam_density=beam_density,
+        )
+        losses[self.name] += loss
+        backprop_tok2vec(golds)
+        if sgd is not None:
+            self.finish_update(sgd)
+
+    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
+        cdef StateClass state
+        cdef Pool mem = Pool()
+        cdef int i
+
+        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
+
+        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
+        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
+        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
+                                        dtype='f', order='C')
+        c_d_scores = <float*>d_scores.data
+        unseen_classes = self.model.attrs["unseen_classes"]
+        for i, (state, gold) in enumerate(zip(states, golds)):
+            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
+            memset(costs, 0, self.moves.n_moves * sizeof(float))
+            self.moves.set_costs(is_valid, costs, state.c, gold)
+            for j in range(self.moves.n_moves):
+                if costs[j] <= 0.0 and j in unseen_classes:
+                    unseen_classes.remove(j)
+            cpu_log_loss(c_d_scores,
+                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            c_d_scores += d_scores.shape[1]
+        # Note that we don't normalize this. See comment in update() for why.
+        if losses is not None:
+            losses.setdefault(self.name, 0.)
+            losses[self.name] += (d_scores**2).sum()
+        return d_scores
 
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
@@ -620,7 +736,7 @@ class Parser(TrainablePipe):
             for example in islice(get_examples(), 10):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize((doc_sample, self.moves))
+        self.model.initialize(doc_sample)
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)
 
@@ -713,27 +829,26 @@ class Parser(TrainablePipe):
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
-        sequence or a cap. A long doc will get multiple states. Let's say we
-        have a doc of length 2*N, where N is the shortest doc. We'll make
-        two states, one representing long_doc[:N], and another representing
-        long_doc[N:]."""
+        sequence or a cap. A long
+        doc will get multiple states. Let's say we have a doc of length 2*N,
+        where N is the shortest doc. We'll make two states, one representing
+        long_doc[:N], and another representing long_doc[N:]."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-            TransitionSystem moves = self.moves
-        all_states = moves.init_batch([eg.predicted for eg in examples])
+        all_states = self.moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if moves.has_gold(eg) and not state.is_final():
-                gold = moves.init_gold(state, eg)
+            if self.moves.has_gold(eg) and not state.is_final():
+                gold = self.moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = moves.get_oracle_sequence_from_state(
+                    oracle_actions = self.moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -743,52 +858,13 @@ class Parser(TrainablePipe):
             for i in range(0, len(oracle_actions), max_length):
                 start_state = state.copy()
                 for clas in oracle_actions[i:i+max_length]:
-                    action = moves.c[clas]
+                    action = self.moves.c[clas]
                     action.do(state.c, action.label)
                     if state.is_final():
                         break
-                if moves.has_gold(eg, start_state.B(0), state.B(0)):
+                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
                     states.append(start_state)
                     golds.append(gold)
                 if state.is_final():
                     break
         return states, golds, max_length
-
-
-@contextlib.contextmanager
-def _change_attrs(model, **kwargs):
-    """Temporarily modify a thinc model's attributes."""
-    unset = object()
-    old_attrs = {}
-    for key, value in kwargs.items():
-        old_attrs[key] = model.attrs.get(key, unset)
-        model.attrs[key] = value
-    yield model
-    for key, value in old_attrs.items():
-        if value is unset:
-            model.attrs.pop(key)
-        else:
-            model.attrs[key] = value
-
-
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
-    cdef int step
-    cdef StateClass state
-    cdef StateC* c_state
-    actions = []
-    while True:
-        step = len(actions)
-
-        step_actions = []
-        for state in states:
-            c_state = state.c
-            if step < c_state.history.size():
-                step_actions.append(c_state.history[step])
-
-        # We are done if we have exhausted all histories.
-        if len(step_actions) == 0:
-            break
-
-        actions.append(numpy.array(step_actions, dtype="i"))
-
-    return actions
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index bb9b7653ce3..5d5fbb02790 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,7 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-def test_overfitting_IO():
-    fix_random_seed(1)
+@pytest.mark.parametrize("use_upper", [True, False])
+def test_overfitting_IO(use_upper):
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {}})
+    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,6 +577,7 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
+        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index fe82ad2fde0..f63d56f6922 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-import itertools
-
-import numpy
 import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
@@ -62,8 +59,6 @@
     ),
 ]
 
-PARSERS = ["parser"]  # TODO: Test beam_parser when ready
-
 eps = 0.1
 
 
@@ -176,57 +171,6 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
-def test_parser_apply_actions(en_vocab, en_parser):
-    words = ["I", "ate", "pizza"]
-    words2 = ["Eat", "more", "pizza", "!"]
-    doc1 = Doc(en_vocab, words=words)
-    doc2 = Doc(en_vocab, words=words2)
-    docs = [doc1, doc2]
-
-    moves = en_parser.moves
-    moves.add_action(0, "")
-    moves.add_action(1, "")
-    moves.add_action(2, "nsubj")
-    moves.add_action(3, "obj")
-    moves.add_action(2, "amod")
-
-    actions = [
-        numpy.array([0, 0], dtype="i"),
-        numpy.array([2, 0], dtype="i"),
-        numpy.array([0, 4], dtype="i"),
-        numpy.array([3, 3], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([0], dtype="i"),
-        numpy.array([1], dtype="i"),
-    ]
-
-    states = moves.init_batch(docs)
-    active_states = states
-
-    for step_actions in actions:
-        active_states = moves.apply_actions(active_states, step_actions)
-
-    assert len(active_states) == 0
-
-    for (state, doc) in zip(states, docs):
-        moves.set_annotations(state, doc)
-
-    assert docs[0][0].head.i == 1
-    assert docs[0][0].dep_ == "nsubj"
-    assert docs[0][1].head.i == 1
-    assert docs[0][1].dep_ == "ROOT"
-    assert docs[0][2].head.i == 1
-    assert docs[0][2].dep_ == "obj"
-
-    assert docs[1][0].head.i == 0
-    assert docs[1][0].dep_ == "ROOT"
-    assert docs[1][1].head.i == 2
-    assert docs[1][1].dep_ == "amod"
-    assert docs[1][2].head.i == 0
-    assert docs[1][2].dep_ == "obj"
-
-
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -375,7 +319,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", PARSERS)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -401,15 +345,11 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize(
-    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
-)
-def test_overfitting_IO(pipe_name, max_moves):
-    fix_random_seed(0)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
-    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -515,12 +455,10 @@ def test_distill(max_moves):
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TODO: re-enable after we have a spacy-legacy release for v4. See
-        # https://github.com/explosion/spacy-legacy/pull/36
-        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V1
+        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V2
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
-        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f6cefbc1f84..c3c4bb6c686 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v3"
+    @architectures = "spacy.TransitionBasedParser.v2"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..8a1c74ca9ed 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,11 +189,33 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
+use_upper = true
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+parser_config_string_no_upper = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 66
+maxout_pieces = 2
+use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -224,6 +246,7 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
+        use_upper=True,
     )
     return parser
 
@@ -337,16 +360,15 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("W")
-        assert output.has_param("b")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        assert model.get_ref("upper").get_dim("nI") == 65
+        assert model.get_ref("lower").get_dim("nI") == 65
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -359,13 +381,11 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("b")
-        assert output.has_param("W")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        if model.attrs["has_upper"]:
+            assert model.get_ref("upper").get_dim("nI") == 66
+        assert model.get_ref("lower").get_dim("nI") == 66
 
 
 def test_config_nlp_roundtrip():
@@ -561,7 +581,9 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c05ef625e11..44717f6eba2 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,19 +1,22 @@
 import ctypes
 import os
 from pathlib import Path
-
 import pytest
-from pydantic import ValidationError
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+
+try:
+    from pydantic.v1 import ValidationError
+except ImportError:
+    from pydantic import ValidationError  # type: ignore
+
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int, find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -96,6 +99,34 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
+    assert model.get_param("W").shape == (nF, nO, nP, nI)
+    tensor = model.ops.alloc((10, nI))
+    Y, get_dX = model.begin_update(tensor)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
+    dY = model.ops.alloc((15, nO, nP))
+    ids = model.ops.alloc((15, nF))
+    ids[1, 2] = -1
+    dY[1] = 1
+    assert not model.has_grad("pad")
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 2, 0, 0] == 1.0
+    ids.fill(0.0)
+    dY.fill(0.0)
+    dY[0] = 0
+    ids[1, 2] = 0
+    ids[1, 1] = -1
+    ids[1, 0] = -1
+    dY[1] = 1
+    ids[2, 0] = -1
+    dY[2] = 5
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 0, 0, 0] == 6
+    assert d_pad[0, 1, 0, 0] == 1
+    assert d_pad[0, 2, 0, 0] == 0
+
+
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index db8f974ea19..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,17 +833,18 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v3"
+> @architectures = "spacy.TransitionBasedParser.v2"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
+> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -873,22 +874,23 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                       |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
- </Accordion>
+</Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 44c80622437..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}

From 9add7d494b15169fc09994233b337ccec973c3c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:24:09 +0100
Subject: [PATCH 464/504] isort

---
 spacy/ml/tb_framework.py                    |  3 +-
 spacy/pipeline/_parser_internals/_state.pxd |  3 +-
 spacy/pipeline/dep_parser.pyx               |  3 +-
 spacy/pipeline/ner.pyx                      |  9 +++--
 spacy/pipeline/transition_parser.pyx        | 45 +++++++++++++--------
 spacy/tests/parser/test_ner.py              |  1 -
 spacy/tests/test_misc.py                    | 20 ++++-----
 7 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e24e..e351ad4e570 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
 from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
 from ..util import registry
+from .parser_model import ParserStepModel
 
 
 @registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 04274ce8af1..c063cf97cd4 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,4 +1,5 @@
 cimport libcpp
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -7,8 +8,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3e59deaae4f..1fdc55dab41 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
 
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c73ec5c528a..29f22b0174d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -10,13 +10,14 @@ from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+
 from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..util import registry
+from ..scorer import PRFScore, get_ner_prf
 from ..training import remove_bilu_prefix
-
+from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 329a5b0ca85..e78902f9775 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,21 +1,20 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 # cython: profile=False
 from __future__ import print_function
+
 from typing import Dict, Iterable, List, Optional, Tuple
-from cymem.cymem cimport Pool
+
 cimport numpy as np
+from cymem.cymem cimport Pool
+
 from itertools import islice
-from libcpp.vector cimport vector
-from libc.string cimport memset, memcpy
+
 from libc.stdlib cimport calloc, free
+from libc.string cimport memcpy, memset
+from libcpp.vector cimport vector
+
 import random
 
-import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
-from thinc.api import chain, softmax_activation, use_ops
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d
-import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -23,21 +22,36 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    chain,
     get_array_module,
     get_ops,
     set_dropout_rate,
+    softmax_activation,
+    use_ops,
 )
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d, Ints1d
 
-from ._parser_internals.stateclass cimport StateClass
-from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
+from ..ml.parser_model cimport (
+    ActivationsC,
+    SizesC,
+    WeightsC,
+    alloc_activations,
+    arg_max_if_valid,
+    cpu_log_loss,
+    free_activations,
+    get_c_sizes,
+    get_c_weights,
+    predict_states,
+)
 from ..tokens.doc cimport Doc
+from ._parser_internals.search cimport Beam
+from ._parser_internals.stateclass cimport StateClass
+
 from .trainable_pipe import TrainablePipe
+
 from ._parser_internals cimport _beam_utils
+
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
@@ -84,7 +98,6 @@ cdef extern from "<algorithm>" namespace "std" nogil:
     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
 
 
-
 NUMPY_OPS = NumpyOps()
 
 
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 5d5fbb02790..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,7 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 44717f6eba2..d2a41ff0fed 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 from pathlib import Path
+
 import pytest
 
 try:
@@ -8,15 +9,16 @@
 except ImportError:
     from pydantic import ValidationError  # type: ignore
 
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util

From 4e9ac540f89509d96e097e7ed87c943d2a695759 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:38:01 +0100
Subject: [PATCH 465/504] Add distillation tests with max cut size

And fix endless loop when the max cut size is 0 or 1.
---
 spacy/pipeline/transition_parser.pyx | 2 +-
 spacy/tests/parser/test_ner.py       | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index e78902f9775..63d36607475 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -325,7 +325,7 @@ cdef class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
             states = self._init_batch(teacher_step_model, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..7c3a9d56249 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )

From db18664a08ca1dee7906e41d7a501fcd9d2a8186 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:02:15 +0100
Subject: [PATCH 466/504] Fix Cython lints

---
 spacy/pipeline/_parser_internals/ner.pyx |  4 +--
 spacy/pipeline/dep_parser.pyx            |  1 +
 spacy/pipeline/ner.pyx                   |  3 +-
 spacy/pipeline/transition_parser.pyx     | 42 +++++++++---------------
 4 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 0a79e77cb86..a4f7094520c 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -166,7 +166,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-    
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -651,7 +651,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
- 
+
 
 
 cdef class Out:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fdc55dab41..cbd7187ff0f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 29f22b0174d..fe54d33a17b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 63d36607475..23768a92fcb 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -10,7 +10,7 @@ from cymem.cymem cimport Pool
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
@@ -23,14 +23,13 @@ from thinc.api import (
     NumpyOps,
     Optimizer,
     chain,
-    get_array_module,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
     ActivationsC,
@@ -45,7 +44,6 @@ from ..ml.parser_model cimport (
     predict_states,
 )
 from ..tokens.doc cimport Doc
-from ._parser_internals.search cimport Beam
 from ._parser_internals.stateclass cimport StateClass
 
 from .trainable_pipe import TrainablePipe
@@ -55,11 +53,10 @@ from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
-from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals cimport _beam_utils
 from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals.transition_system cimport Transition
 from .trainable_pipe cimport TrainablePipe
 
 from .. import util
@@ -318,7 +315,7 @@ cdef class Parser(TrainablePipe):
         with use_ops("numpy"):
             teacher_model = chain(teacher_step_model, softmax_activation())
             student_model = chain(student_step_model, softmax_activation())
-        
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -463,8 +460,6 @@ cdef class Parser(TrainablePipe):
         return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
         batch = _beam_utils.BeamBatch(
             self.moves,
@@ -485,15 +480,15 @@ cdef class Parser(TrainablePipe):
         return list(batch)
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+                      WeightsC weights, SizesC sizes) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(states, activations.scores,
+                                    sizes.classes, sizes.states)
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -522,7 +517,7 @@ cdef class Parser(TrainablePipe):
         return [state for state in states if not state.c.is_final()]
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
+                                 int nr_class, int batch_size) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -580,8 +575,7 @@ cdef class Parser(TrainablePipe):
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
- 
-        all_states = list(states)
+
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -661,8 +655,8 @@ cdef class Parser(TrainablePipe):
         del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
+                    losses=None, beam_density=0.0):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -693,7 +687,7 @@ cdef class Parser(TrainablePipe):
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                        dtype='f', order='C')
+                                               dtype='f', order='C')
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -703,8 +697,8 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
+                         d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
@@ -814,10 +808,7 @@ cdef class Parser(TrainablePipe):
         long_doc[:N], and another representing long_doc[N:]. In contrast to
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
-        cdef:
-            StateClass start_state
-            StateClass state
-            Transition action
+        cdef StateClass state
         all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
@@ -839,7 +830,6 @@ cdef class Parser(TrainablePipe):
                 length += 1
         return states
 
-
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long

From 5f2298ec9696ffc3c5ca1e97582647cd3b2eac84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 09:47:38 +0100
Subject: [PATCH 467/504] No need for `Literal` compat, since we only support
 >= 3.8

---
 spacy/compat.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 30459e2e495..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,11 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle

From 63a9e00430747687f011ab82e04a33527a36adc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 16 Jan 2024 14:54:26 +0100
Subject: [PATCH 468/504] Update thinc dependency to 9.0.0.dev4

---
 pyproject.toml                       | 2 +-
 requirements.txt                     | 2 +-
 spacy/pipeline/morphologizer.pyx     | 3 ++-
 spacy/pipeline/transition_parser.pyx | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3891d137867..0a5bc162773 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=9.0.0.dev4,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 3a1ef6b70b4..0c852a0d01c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev4,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index f822c38ac0e..77c643d4630 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -302,7 +302,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 23768a92fcb..39cdc3c812a 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -22,13 +22,13 @@ from thinc.api import (
     CupyOps,
     NumpyOps,
     Optimizer,
+    SequenceCategoricalCrossentropy,
     chain,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
 from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
@@ -378,7 +378,7 @@ cdef class Parser(TrainablePipe):
 
         DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
         """
-        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(normalize=False)
         d_scores, loss = loss_func(student_scores, teacher_scores)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))

From 0199f0959d1aa59088b67b85652a9c22de004cef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 2 Jan 2024 10:03:06 +0100
Subject: [PATCH 469/504] Add spacy.TextCatParametricAttention.v1 (#13201)

* Add spacy.TextCatParametricAttention.v1

This layer provides is a simplification of the ensemble classifier that
only uses paramteric attention. We have found empirically that with a
sufficient amount of training data, using the ensemble classifier with
BoW does not provide significant improvement in classifier accuracy.
However, plugging in a BoW classifier does reduce GPU training and
inference performance substantially, since it uses a GPU-only kernel.

* Fix merge fallout
---
 pyproject.toml   | 5 +++--
 requirements.txt | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0a5bc162773..bfd7e68d1f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev4,<9.1.0",
-    "numpy>=1.15.0",
+    "thinc>=8.2.2,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 0c852a0d01c..94a9d17c0c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev4,<9.1.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0

From 21809727df98badb1a2291c0dae9806208e1178f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 24 Jan 2024 10:28:46 +0100
Subject: [PATCH 470/504] Typing fixes

---
 spacy/tokens/span.pyi | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 2a529593e5f..f1030278c69 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -139,6 +139,8 @@ class Span:
     def lemma_(self) -> str: ...
     @property
     def label_(self) -> str: ...
+    @label_.setter
+    def label_(self, label: str): ...
     @property
     def kb_id_(self) -> str: ...
     @property

From f436b68644a52c30c14273689d69fcdcf87be6d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 12 Jan 2022 13:38:52 +0100
Subject: [PATCH 471/504] Span/SpanGroup: wrap SpanC in shared_ptr (#9869)

* Span/SpanGroup: wrap SpanC in shared_ptr

When a Span that was retrieved from a SpanGroup was modified, these
changes were not reflected in the SpanGroup because the underlying
SpanC struct was copied.

This change applies the solution proposed by @nrodnova, to wrap SpanC in
a shared_ptr. This makes a SpanGroup and Spans derived from it share the
same SpanC. So, changes made through a Span are visible in the SpanGroup
as well.

Fixes #9556

* Test that a SpanGroup is modified through its Spans

* SpanGroup.push_back: remove nogil

Modifying std::vector is not thread-safe.

* C++ >= 11 does not allow const T in vector<T>

* Add Span.span_c as a shorthand for Span.c.get

Since this method is cdef'ed, it is only visible from Cython, so we
avoid using raw pointers in Python

Replace existing uses of span.c.get() to use this new method.

* Fix formatting

* Style fix: pointer types

* SpanGroup.to_bytes: reduce number of shared_ptr::get calls

* Mark SpanGroup modification test with issue

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tokens/span.pxd | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index fb592e68bd8..68f722a13cb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 from libcpp.memory cimport shared_ptr
 

From 7ea5859fc83b45d7df8e02f6d5ac490bdb7fba84 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Fri, 15 Jul 2022 11:14:08 +0200
Subject: [PATCH 472/504] `Morphology`/`Morphologizer` optimizations and
 refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code

* `Morphologzier`: Avoid unnecessary sorting of morpho features

* `Morphologizer`: Remove execessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints
Update docs

* Remove unused method

* Replace `unique_ptr` usage with `shared_ptr`

* Add type annotations to internal Python methods, rename `hash` variable, fix typos

* Add comment to clarify implementation detail

* Fix return type

* `Morphology`: Stop early when splitting fields and values
---
 spacy/pipeline/morphologizer.pyx | 3 +--
 spacy/tokens/morphanalysis.pyx   | 7 +++++++
 spacy/tokens/token.pyx           | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 77c643d4630..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -302,8 +302,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
-                                                    label_smoothing=self.cfg["label_smoothing"])
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index e5665496ff4..b4f7ffbb0d9 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -9,6 +9,13 @@ from libcpp.memory cimport shared_ptr
 from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
+from ..typedefs cimport hash_t, attr_t
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index d9fdcb75263..f6c6ad8b9a1 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -40,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref
 
 from cython.operator cimport dereference as deref
 

From 0a66d8b6c5ae185951319290f586059e876461e3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Aug 2022 13:56:35 +0200
Subject: [PATCH 473/504] Make stable private modules public and adjust names
 (#11353)

* Make stable private modules public and adjust names

* `spacy.ml._character_embed` -> `spacy.ml.character_embed`
* `spacy.ml._precomputable_affine` -> `spacy.ml.precomputable_affine`
* `spacy.tokens._serialize` -> `spacy.tokens.doc_bin`
* `spacy.tokens._retokenize` -> `spacy.tokens.retokenize`
* `spacy.tokens._dict_proxies` -> `spacy.tokens.span_groups`

* Skip _precomputable_affine

* retokenize -> retokenizer

* Fix imports
---
 spacy/tokens/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 7617e462fde..16c43485340 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,6 +4,7 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .token import Token
+from .doc_bin import DocBin
+from .morphanalysis import MorphAnalysis
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]

From 29a897b9b74df91954f9ceeb57b59bfc20ae71c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 13 Sep 2022 09:51:12 +0200
Subject: [PATCH 474/504] Store activations in `Doc`s when `save_activations`
 is enabled (#11002)

* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/pipeline/textcat.py | 4 ++++
 spacy/tokens/doc.pxd      | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 13841dd7bbb..79a98b9bc5f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index fc0404f1423..bdd3098a7d1 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -52,6 +52,8 @@ cdef class Doc:
 
     cdef public dict activations
 
+    cdef public dict activations
+
     cdef public dict user_hooks
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks

From c5f970a6970686c52e278ac392fffa713d8edb89 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan <shadeMe@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:51:06 +0200
Subject: [PATCH 475/504] `StringStore` refactoring (#11344)

* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb6b08cfdcfbf26df6896f26709fac0d.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
---
 spacy/strings.pxd | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index c05731c9a15..688dbc46261 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,3 +1,6 @@
+from libc.stdint cimport int64_t, uint32_t
+from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set

From 99e5d3f82692ae307dbe6c6d107bedb5a6e3ea18 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 21 Oct 2022 18:01:18 +0900
Subject: [PATCH 476/504] Remove thinc util reimports (#11665)

* Remove imports marked as v2 leftovers

There are a few functions that were in `spacy.util` in v2, but were
moved to Thinc. In v3 these were imported in `spacy.util` so that code
could be used unchanged, but the comment over them indicates they should
always be imported from Thinc. This commit removes those imports.

It doesn't look like any DeprecationWarning was ever thrown for using
these, but it is probably fine to remove them anyway with a major
version. It is not clear that they were widely used.

* Import fix_random_seed correctly

This seems to be the only place in spaCy that was using the old import.
---
 spacy/util.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index ae9837e3afe..fdc02a717cc 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -59,6 +59,9 @@
     cupy = None
 
 
+from .symbols import ORTH
+from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From b01903ea3cbf5bc131ae3a746cf73fe30be39c41 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Oct 2022 09:11:35 +0200
Subject: [PATCH 477/504] Replace EntityRuler with SpanRuler implementation
 (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/pipeline/span_ruler.py              | 8 ++++++++
 spacy/tests/pipeline/test_entity_ruler.py | 6 ++++++
 2 files changed, 14 insertions(+)

diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index cd8fea36b47..3f876598013 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,6 +17,14 @@
 
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
+from ..language import Language
+from ..errors import Errors, Warnings
+from ..util import ensure_path, SimpleFrozenList, registry
+from ..tokens import Doc, Span
+from ..scorer import Scorer, get_ner_prf
+from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 12f2c9def2d..74731140688 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,6 +2,12 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, merge_entities
+from spacy.pipeline import SpanRuler
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language

From 9cab045bc040f7e29990f2c2e29b2226255eec05 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Dec 2022 19:45:52 +0900
Subject: [PATCH 478/504] Remove old model shortcuts (#11916)

* Remove old model shortcuts

* Remove error, docs warnings about shortcuts

* Fix import in util

Accidentally deleted the whole import and not just the old part...

* Change universe example to v3 style

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Update some model loading in Universe

* Add v2 tag to neuralcoref

* Use the spacy-version feature instead of a v2 tag

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/download.py | 11 ++---------
 spacy/util.py         |  2 +-
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0635522930b..0b8ed54ed3c 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,15 +7,8 @@
 from wasabi import msg
 
 from .. import about
-from ..util import (
-    get_installed_models,
-    get_minor_version,
-    get_package_version,
-    is_package,
-    is_prerelease_version,
-    run_command,
-)
-from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
+from ..util import is_package, get_minor_version, run_command
+from ..util import is_prerelease_version
 
 
 @app.command(
diff --git a/spacy/util.py b/spacy/util.py
index fdc02a717cc..4f4718af5ff 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -61,7 +61,7 @@
 
 from .symbols import ORTH
 from .compat import cupy, CudaStream, is_windows, importlib_metadata
-from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
+from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From 483c92d23cc2b0d1d4a87d03f453bf015c31f921 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 12 Dec 2022 08:55:53 +0100
Subject: [PATCH 479/504] Custom extensions for spans with equal boundaries
 (#11429)

* Init

* Fix return type for mypy

* adjust types and improve setting new attributes

* Add underscore changes to json conversion

* Add test and underscore changes to from_docs

* add underscore changes and test to span.to_doc

* update return values

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add types to function

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* adjust formatting

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* shorten return type

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* add helper function to improve readability

* Improve code and add comments

* rerun azure tests

* Fix tests for json conversion

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/doc/test_underscore.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index c6bb5ad4e33..37094ac3e06 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+# Helper functions
+def _get_tuple(s: Span):
+    return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
+
 
 # Helper functions
 def _get_tuple(s: Span):

From 480b4cfae25ca547334f4fc0b3def414ed445494 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Sat, 17 Dec 2022 14:32:19 +0100
Subject: [PATCH 480/504] Fix v4 branch to build against Thinc v9 (#11921)

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals`

Backport of:
https://github.com/explosion/spaCy/pull/11317

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Replace references to `thinc.backends.linalg` with `CBlas`

Backport of:
https://github.com/explosion/spaCy/pull/11292

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use cross entropy from `thinc.legacy`

* Require thinc>=9.0.0.dev0,<9.1.0

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 pyproject.toml                       |  5 ++---
 requirements.txt                     |  2 +-
 spacy/pipeline/morphologizer.pyx     |  2 +-
 spacy/pipeline/transition_parser.pyx | 21 +++++++++------------
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bfd7e68d1f7..77e1471426f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,9 +5,8 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.2.2,<8.3.0",
-    "numpy>=1.15.0; python_version < '3.9'",
-    "numpy>=1.25.0; python_version >= '3.9'",
+    "thinc>=9.0.0.dev0,<9.1.0",
+    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 94a9d17c0c3..699057643c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.2.2,<8.3.0
+thinc>=9.0.0.dev0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index cc8f87936b9..f822c38ac0e 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -302,7 +302,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
         truths = []
         for eg in examples:
             eg_truths = []
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 39cdc3c812a..025e0419c08 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -15,6 +15,9 @@ from libcpp.vector cimport vector
 
 import random
 
+import srsly
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+import numpy.random
 import numpy
 import numpy.random
 import srsly
@@ -31,18 +34,12 @@ from thinc.api import (
 )
 from thinc.types import Floats2d
 
-from ..ml.parser_model cimport (
-    ActivationsC,
-    SizesC,
-    WeightsC,
-    alloc_activations,
-    arg_max_if_valid,
-    cpu_log_loss,
-    free_activations,
-    get_c_sizes,
-    get_c_weights,
-    predict_states,
-)
+from ._parser_internals.stateclass cimport StateClass
+from ._parser_internals.search cimport Beam
+from ..ml.parser_model cimport alloc_activations, free_activations
+from ..ml.parser_model cimport predict_states, arg_max_if_valid
+from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
+from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
 from ._parser_internals.stateclass cimport StateClass
 

From 28b7cb209748f3c6dd68b4d00dcfdb3b79b71129 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 29 Dec 2022 08:03:24 +0100
Subject: [PATCH 481/504] Adjust to new `Schedule` class and pass scores to
 `Optimizer` (#12008)

* Adjust to new `Schedule` class and pass scores to `Optimizer`

Requires https://github.com/explosion/thinc/pull/804

* Bump minimum Thinc requirement to 9.0.0.dev1
---
 pyproject.toml   |  2 +-
 requirements.txt |  2 +-
 spacy/util.py    | 13 +++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 77e1471426f..58e4382e829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev0,<9.1.0",
+    "thinc>=9.0.0.dev1,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 699057643c9..5c889c91f81 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev0,<9.1.0
+thinc>=9.0.0.dev1,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/spacy/util.py b/spacy/util.py
index 4f4718af5ff..a76e8f73eeb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,7 +2,12 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import inspect
+import re
+from pathlib import Path
+import thinc
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+import functools
 import itertools
 import logging
 import os
@@ -1617,12 +1622,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = itertools.repeat(size)
+        size_ = constant_schedule(size)
     else:
         size_ = iter(size)
     items = iter(items)
-    while True:
-        batch_size = next(size_)
+    for step in itertools.count():
+        batch_size = size_(step)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From fae4979f9d05782b342695a4aedc10491d230113 Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso <tetsuok@users.noreply.github.com>
Date: Wed, 4 Jan 2023 01:43:09 +0900
Subject: [PATCH 482/504] Delete unused imports for StringStore (#12040)

---
 spacy/tokenizer.pxd | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index b2e50969462..2610532b75d 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,6 +2,10 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
+from .typedefs cimport hash_t
+from .structs cimport LexemeC, SpanC, TokenC
+from .tokens.doc cimport Doc
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc

From 2c1f25072cf2b3cd7c20397d86a2a19715044e26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 16 Jan 2023 10:25:53 +0100
Subject: [PATCH 483/504] Add
 `TrainablePipe.{distill,get_teacher_student_loss}` (#12016)

* Add `TrainablePipe.{distill,get_teacher_student_loss}`

This change adds two methods:

- `TrainablePipe::distill` which performs a training step of a
   student pipe on a teacher pipe, giving a batch of `Doc`s.
- `TrainablePipe::get_teacher_student_loss` computes the loss
  of a student relative to the teacher.

The `distill` or `get_teacher_student_loss` methods are also implemented
in the tagger, edit tree lemmatizer, and parser pipes, to enable
distillation in those pipes and as an example for other pipes.

* Fix stray `Beam` import

* Fix incorrect import

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TrainablePipe.distill: use `Iterable[Example]`

* Add Pipe.is_distillable method

* Add `validate_distillation_examples`

This first calls `validate_examples` and then checks that the
student/teacher tokens are the same.

* Update distill documentation

* Add distill documentation for all pipes that support distillation

* Fix incorrect identifier

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Add comment to explain `is_distillable`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/training/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index adfc2bb6658..9445d0b63a5 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,3 +1,6 @@
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401

From 283ef2b9a822c123a037dd742d23737040b1e0cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 11:27:45 +0100
Subject: [PATCH 484/504] Merge the parser refactor into `v4` (#10940)

* Try to fix doc.copy

* Set dev version

* Make vocab always own lexemes

* Change version

* Add SpanGroups.copy method

* Fix set_annotations during Parser.update

* Fix dict proxy copy

* Upd version

* Fix copying SpanGroups

* Fix set_annotations in parser.update

* Fix parser set_annotations during update

* Revert "Fix parser set_annotations during update"

This reverts commit eb138c89edb306608826dca50619ea8a60de2b14.

* Revert "Fix set_annotations in parser.update"

This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d.

* Fix set_annotations during parser update

* Inc version

* Handle final states in get_oracle_sequence

* Inc version

* Try to fix parser training

* Inc version

* Fix

* Inc version

* Fix parser oracle

* Inc version

* Inc version

* Fix transition has_gold

* Inc version

* Try to use real histories, not oracle

* Inc version

* Upd parser

* Inc version

* WIP on rewrite parser

* WIP refactor parser

* New progress on parser model refactor

* Prepare to remove parser_model.pyx

* Convert parser from cdef class

* Delete spacy.ml.parser_model

* Delete _precomputable_affine module

* Wire up tb_framework to new parser model

* Wire up parser model

* Uncython ner.pyx and dep_parser.pyx

* Uncython

* Work on parser model

* Support unseen_classes in parser model

* Support unseen classes in parser

* Cleaner handling of unseen classes

* Work through tests

* Keep working through errors

* Keep working through errors

* Work on parser. 15 tests failing

* Xfail beam stuff. 9 failures

* More xfail. 7 failures

* Xfail. 6 failures

* cleanup

* formatting

* fixes

* pass nO through

* Fix empty doc in update

* Hackishly fix resizing. 3 failures

* Fix redundant test. 2 failures

* Add reference version

* black formatting

* Get tests passing with reference implementation

* Fix missing prints

* Add missing file

* Improve indexing on reference implementation

* Get non-reference forward func working

* Start rigging beam back up

* removing redundant tests, cf #8106

* black formatting

* temporarily xfailing issue 4314

* make flake8 happy again

* mypy fixes

* ensure labels are added upon predict

* cleanup remnants from merge conflicts

* Improve unseen label masking

Two changes to speed up masking by ~10%:

- Use a bool array rather than an array of float32.

- Let the mask indicate whether a label was seen, rather than
  unseen. The mask is most frequently used to index scores for
  seen labels. However, since the mask marked unseen labels,
  this required computing an intermittent flipped mask.

* Write moves costs directly into numpy array (#10163)

This avoids elementwise indexing and the allocation of an additional
array.

Gives a ~15% speed improvement when using batch_by_sequence with size
32.

* Temporarily disable ner and rehearse tests

Until rehearse is implemented again in the refactored parser.

* Fix loss serialization issue (#10600)

* Fix loss serialization issue

Serialization of a model fails with:

TypeError: array(738.3855, dtype=float32) is not JSON serializable

Fix this using float conversion.

* Disable CI steps that require spacy.TransitionBasedParser.v2

After finishing the refactor, TransitionBasedParser.v2 should be
provided for backwards compat.

* Add back support for beam parsing to the refactored parser (#10633)

* Add back support for beam parsing

Beam parsing was already implemented as part of the `BeamBatch` class.
This change makes its counterpart `GreedyBatch`. Both classes are hooked
up in `TransitionModel`, selecting `GreedyBatch` when the beam size is
one, or `BeamBatch` otherwise.

* Use kwarg for beam width

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Avoid implicit default for beam_width and beam_density

* Parser.{beam,greedy}_parse: ensure labels are added

* Remove 'deprecated' comments

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parser `StateC` optimizations (#10746)

* `StateC`: Optimizations

Avoid GIL acquisition in `__init__`
Increase default buffer capacities on init
Reduce C++ exception overhead

* Fix typo

* Replace `set::count` with `set::find`

* Add exception attribute to c'tor

* Remove unused import

* Use a power-of-two value for initial capacity
Use default-insert to init `_heads` and `_unshiftable`

* Merge `cdef` variable declarations and assignments

* Vectorize `example.get_aligned_parses` (#10789)

* `example`: Vectorize `get_aligned_parse`
Rename `numpy` import

* Convert aligned array to lists before returning

* Revert import renaming

* Elide slice arguments when selecting the entire range

* Tagger/morphologizer alignment performance optimizations (#10798)

* `example`: Unwrap `numpy` scalar arrays before passing them to `StringStore.__getitem__`

* `AlignmentArray`: Use native list as staging buffer for offset calculation

* `example`: Vectorize `get_aligned`

* Hoist inner functions out of `get_aligned`

* Replace inline `if..else` clause in assignment statement

* `AlignmentArray`: Use raw indexing into offset and data `numpy` arrays

* `example`: Replace array unique value check with `groupby`

* `example`: Correctly exclude tokens with no alignment in `_get_aligned_vectorized`
Simplify `_get_aligned_non_vectorized`

* `util`: Update `all_equal` docstring

* Explicitly use `int32_t*`

* Restore C CPU inference in the refactored parser (#10747)

* Bring back the C parsing model

The C parsing model is used for CPU inference and is still faster for
CPU inference than the forward pass of the Thinc model.

* Use C sgemm provided by the Ops implementation

* Make tb_framework module Cython, merge in C forward implementation

* TransitionModel: raise in backprop returned from forward_cpu

* Re-enable greedy parse test

* Return transition scores when forward_cpu is used

* Apply suggestions from code review

Import `Model` from `thinc.api`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use relative imports in tb_framework

* Don't assume a default for beam_width

* We don't have a direct dependency on BLIS anymore

* Rename forwards to _forward_{fallback,greedy_cpu}

* Require thinc >=8.1.0,<8.2.0

* tb_framework: clean up imports

* Fix return type of _get_seen_mask

* Move up _forward_greedy_cpu

* Style fixes.

* Lower thinc lowerbound to 8.1.0.dev0

* Formatting fix

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Reimplement parser rehearsal function (#10878)

* Reimplement parser rehearsal function

Before the parser refactor, rehearsal was driven by a loop in the
`rehearse` method itself. For each parsing step, the loops would:

1. Get the predictions of the teacher.
2. Get the predictions and backprop function of the student.
3. Compute the loss and backprop into the student.
4. Move the teacher and student forward with the predictions of
   the student.

In the refactored parser, we cannot perform search stepwise rehearsal
anymore, since the model now predicts all parsing steps at once.
Therefore, rehearsal is performed in the following steps:

1. Get the predictions of all parsing steps from the student, along
   with its backprop function.
2. Get the predictions from the teacher, but use the predictions of
   the student to advance the parser while doing so.
3. Compute the loss and backprop into the student.

To support the second step a new method, `advance_with_actions` is
added to `GreedyBatch`, which performs the provided parsing steps.

* tb_framework: wrap upper_W and upper_b in Linear

Thinc's Optimizer cannot handle resizing of existing parameters. Until
it does, we work around this by wrapping the weights/biases of the upper
layer of the parser model in Linear. When the upper layer is resized, we
copy over the existing parameters into a new Linear instance. This does
not trigger an error in Optimizer, because it sees the resized layer as
a new set of parameters.

* Add test for TransitionSystem.apply_actions

* Better FIXME marker

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Fixes from Madeesh

* Apply suggestions from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove useless assignment

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`

* Parser slow test fixup

We don't have TransitionBasedParser.{v1,v2} until we bring it back as a
legacy option.

* Remove last vestiges of PrecomputableAffine

This does not exist anymore as a separate layer.

* ner: re-enable sentence boundary checks

* Re-enable test that works now.

* test_ner: make loss test more strict again

* Remove commented line

* Re-enable some more beam parser tests

* Remove unused _forward_reference function

* Update for CBlas changes in Thinc 8.1.0.dev2

Bump thinc dependency to 8.1.0.dev3.

* Remove references to spacy.TransitionBasedParser.{v1,v2}

Since they will not be offered starting with spaCy v4.

* `tb_framework`: Replace references to `thinc.backends.linalg` with `CBlas`

* dont use get_array_module (#11056) (#11293)

Co-authored-by: kadarakos <kadar.akos@gmail.com>

* Move `thinc.extra.search` to `spacy.pipeline._parser_internals` (#11317)

* `search`: Move from `thinc.extra.search`
Fix NPE in `Beam.__dealloc__`

* `pytest`: Add support for executing Cython tests
Move `search` tests from thinc and patch them to run with `pytest`

* `mypy` fix

* Update comment

* `conftest`: Expose `register_cython_tests`

* Remove unused import

* Move `argmax` impls to new `_parser_utils` Cython module (#11410)

* Parser does not have to be a cdef class anymore

This also fixes validation of the initialization schema.

* Add back spacy.TransitionBasedParser.v2

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Remove module from setup.py that got added during the merge

* Bring back support for `update_with_oracle_cut_size` (#12086)

* Bring back support for `update_with_oracle_cut_size`

This option was available in the pre-refactor parser, but was never
implemented in the refactored parser. This option cuts transition
sequences that are longer than `update_with_oracle_cut` size into
separate sequences that have at most `update_with_oracle_cut`
transitions. The oracle (gold standard) transition sequence is used to
determine the cuts and the initial states for the additional sequences.

Applying this cut makes the batches more homogeneous in the transition
sequence lengths, making forward passes (and as a consequence training)
much faster.

Training time 1000 steps on de_core_news_lg:

- Before this change: 149s
- After this change: 68s
- Pre-refactor parser: 81s

* Fix a rename that was missed in #10878.

So that rehearsal tests pass.

* Apply suggestions from @shadeMe

* Use chained conditional

* Test with update_with_oracle_cut_size={0, 1, 5, 100}

And fix a git that occurs with a cut size of 1.

* Fix up some merge fall out

* Update parser distillation for the refactor

In the old parser, we'd iterate over the transitions in the distill
function and compute the loss/gradients on the go. In the refactored
parser, we first let the student model parse the inputs. Then we'll let
the teacher compute the transition probabilities of the states in the
student's transition sequence. We can then compute the gradients of the
student given the teacher.

* Add back spacy.TransitionBasedParser.v1 references

- Accordion in the architecture docs.
- Test in test_parse, but disabled until we have a spacy-legacy release.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Co-authored-by: svlandeg <svlandeg@github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: kadarakos <kadar.akos@gmail.com>
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/ml/_precomputable_affine.py             | 164 -----
 spacy/ml/tb_framework.pxd                     |  28 +
 spacy/ml/tb_framework.py                      |  51 --
 spacy/ml/tb_framework.pyx                     | 621 ++++++++++++++++++
 .../_parser_internals/_parser_utils.pxd       |   2 +
 .../_parser_internals/_parser_utils.pyx       |  22 +
 spacy/pipeline/_parser_internals/_state.pxd   |  59 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 +
 spacy/pipeline/_parser_internals/batch.pxd    |   2 +
 spacy/pipeline/_parser_internals/batch.pyx    |  52 ++
 spacy/pipeline/_parser_internals/ner.pyx      |   2 +
 .../pipeline/_parser_internals/stateclass.pyx |   7 +
 .../_parser_internals/transition_system.pxd   |   7 +
 .../_parser_internals/transition_system.pyx   |  70 ++
 .../{dep_parser.pyx => dep_parser.py}         |  17 +-
 spacy/pipeline/{ner.pyx => ner.py}            |  33 +-
 spacy/tests/parser/test_ner.py                |  11 +-
 spacy/tests/parser/test_parse.py              |  80 ++-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  53 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/legacy.mdx                   |   2 +-
 25 files changed, 1006 insertions(+), 382 deletions(-)
 delete mode 100644 spacy/ml/_precomputable_affine.py
 create mode 100644 spacy/ml/tb_framework.pxd
 delete mode 100644 spacy/ml/tb_framework.py
 create mode 100644 spacy/ml/tb_framework.pyx
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 create mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 create mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 create mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (97%)
 rename spacy/pipeline/{ner.pyx => ner.py} (93%)

diff --git a/setup.py b/setup.py
index 0eb529c2098..a4a87d68aec 100755
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,10 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -45,6 +43,7 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -52,6 +51,7 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2817147f3e9..36325711d4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,12 +90,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -111,12 +110,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -387,12 +385,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -405,12 +402,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
deleted file mode 100644
index 1c20c622b2c..00000000000
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
new file mode 100644
index 00000000000..965508519e8
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
deleted file mode 100644
index e351ad4e570..00000000000
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from thinc.api import Model, noop
-
-from ..util import registry
-from .parser_model import ParserStepModel
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
new file mode 100644
index 00000000000..79be13b00bd
--- /dev/null
+++ b/spacy/ml/tb_framework.pyx
@@ -0,0 +1,621 @@
+# cython: infer_types=True, cdivision=True, boundscheck=False
+from typing import List, Tuple, Any, Optional, TypeVar, cast
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from libcpp.vector cimport vector
+import numpy
+cimport numpy as np
+from thinc.api import Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.api import NumpyOps
+from thinc.backends.cblas cimport CBlas, saxpy, sgemm
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
+from thinc.types import Ints1d, Ints2d
+
+from ..errors import Errors
+from ..pipeline._parser_internals import _beam_utils
+from ..pipeline._parser_internals.batch import GreedyBatch
+from ..pipeline._parser_internals._parser_utils cimport arg_max
+from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
+from ..pipeline._parser_internals.transition_system cimport TransitionSystem
+from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..tokens.doc import Doc
+from ..util import registry
+
+
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
+def TransitionModel(
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    beam_width: int = 1,
+    beam_density: float = 0.0,
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
+
+    # FIXME: we use `output` as a container for the output layer's
+    # weights and biases. Thinc optimizers cannot handle resizing
+    # of parameters. So, when the parser model is resized, we
+    # construct a new `output` layer, which has a different key in
+    # the optimizer. Once the optimizer supports parameter resizing,
+    # we can replace the `output` layer by `output_W` and `output_b`
+    # parameters in this model.
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+
+    return Model(
+        name="parser_model",
+        forward=forward,
+        init=init,
+        layers=[tok2vec_projected, output],
+        refs={
+            "tok2vec": tok2vec_projected,
+            "output": output,
+        },
+        params={
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
+        attrs={
+            "beam_width": beam_width,
+            "beam_density": beam_density,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    output = model.get_ref("output")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        output.set_dim("nO", new_nO)
+        output.initialize()
+        return model
+    elif new_nO <= old_nO:
+        return model
+    elif output.has_param("W"):
+        nH = model.get_dim("nH")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
+        new_W[:old_nO] = old_W  # type: ignore
+        new_b[:old_nO] = old_b  # type: ignore
+        for i in range(old_nO, new_nO):
+            model.attrs["unseen_classes"].add(i)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
+    # TODO: Avoid this private intrusion
+    model._dims["nO"] = new_nO
+    return model
+
+
+def init(
+    model,
+    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
+    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
+):
+    if X is not None:
+        docs, moves = X
+        model.get_ref("tok2vec").initialize(X=docs)
+    else:
+        model.get_ref("tok2vec").initialize()
+    inferred_nO = _infer_nO(Y)
+    if inferred_nO is not None:
+        current_nO = model.maybe_get_dim("nO")
+        if current_nO is None or current_nO != inferred_nO:
+            model.attrs["resize_output"](model, inferred_nO)
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nH = model.get_dim("nH")
+    nI = model.get_dim("nI")
+    nF = model.get_dim("nF")
+    ops = model.ops
+
+    Wl = ops.alloc2f(nH * nP, nF * nI)
+    bl = ops.alloc1f(nH * nP)
+    padl = ops.alloc1f(nI)
+    # Wl = zero_init(ops, Wl.shape)
+    Wl = glorot_uniform_init(ops, Wl.shape)
+    padl = uniform_init(ops, padl.shape)  # type: ignore
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
+    # model = _lsuv_init(model)
+    return model
+
+
+class TransitionModelInputs:
+    """
+    Input to transition model.
+    """
+
+    # dataclass annotation is not yet supported in Cython 0.29.x,
+    # so, we'll do something close to it.
+
+    actions: Optional[List[Ints1d]]
+    docs: List[Doc]
+    max_moves: int
+    moves: TransitionSystem
+    states: Optional[List[State]]
+
+    __slots__ = [
+        "actions",
+        "docs",
+        "max_moves",
+        "moves",
+        "states",
+    ]
+
+    def __init__(
+        self,
+        docs: List[Doc],
+        moves: TransitionSystem,
+        actions: Optional[List[Ints1d]]=None,
+        max_moves: int=0,
+        states: Optional[List[State]]=None):
+        """
+        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
+        docs (List[Doc]): Docs to predict transition sequences for.
+        max_moves: (int): the maximum number of moves to apply, values less
+            than 1 will apply moves to states until they are final states.
+        moves (TransitionSystem): the transition system to use when predicting
+            the transition sequences.
+        states (Optional[List[States]]): the initial states to predict the
+            transition sequences for. When absent, the initial states are
+            initialized from the provided Docs.
+        """
+        self.actions = actions
+        self.docs = docs
+        self.moves = moves
+        self.max_moves = max_moves
+        self.states = states
+
+
+def forward(model, inputs: TransitionModelInputs, is_train: bool):
+    docs = inputs.docs
+    moves = inputs.moves
+    actions = inputs.actions
+
+    beam_width = model.attrs["beam_width"]
+    hidden_pad = model.get_param("hidden_pad")
+    tok2vec = model.get_ref("tok2vec")
+
+    states = moves.init_batch(docs) if inputs.states is None else inputs.states
+    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
+    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
+    seen_mask = _get_seen_mask(model)
+
+    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
+        # Note: max_moves is only used during training, so we don't need to
+        #       pass it to the greedy inference path.
+        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
+    else:
+        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
+            feats, backprop_feats, seen_mask, is_train, actions=actions,
+            max_moves=inputs.max_moves)
+
+
+def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
+                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+    cdef vector[StateC*] c_states
+    cdef StateClass state
+    for state in states:
+        if not state.is_final():
+            c_states.push_back(state.c)
+    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
+    # Precomputed features have rows for each token, plus one for padding.
+    cdef int n_tokens = feats.shape[0] - 1
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
+    cdef CBlas cblas = model.ops.cblas()
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+
+    def backprop(dY):
+        raise ValueError(Errors.E4004)
+
+    return (states, scores), backprop
+
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+    cdef int i, j
+    cdef vector[StateC *] unfinished
+    cdef ActivationsC activations = _alloc_activations(sizes)
+    cdef np.ndarray step_scores
+    cdef np.ndarray step_actions
+
+    scores = []
+    while sizes.states >= 1:
+        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
+        step_actions = actions[0] if actions is not None else None
+        with nogil:
+            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
+            if actions is None:
+                # Validate actions, argmax, take action.
+                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
+                    sizes.states)
+            else:
+                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
+            for i in range(sizes.states):
+                if not states[i].is_final():
+                    unfinished.push_back(states[i])
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+        sizes.states = unfinished.size()
+        scores.append(step_scores)
+        unfinished.clear()
+        actions = actions[1:] if actions is not None else None
+    _free_activations(&activations)
+
+    return scores
+
+
+def _forward_fallback(
+    model: Model,
+    moves: TransitionSystem,
+    states: List[StateClass],
+    tokvecs, backprop_tok2vec,
+    feats,
+    backprop_feats,
+    seen_mask,
+    is_train: bool,
+    actions: Optional[List[Ints1d]]=None,
+    max_moves: int=0):
+    nF = model.get_dim("nF")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+
+    beam_width = model.attrs["beam_width"]
+    beam_density = model.attrs["beam_density"]
+
+    ops = model.ops
+
+    all_ids = []
+    all_which = []
+    all_statevecs = []
+    all_scores = []
+    if beam_width == 1:
+        batch = GreedyBatch(moves, states, None)
+    else:
+        batch = _beam_utils.BeamBatch(
+            moves, states, None, width=beam_width, density=beam_density
+        )
+    arange = ops.xp.arange(nF)
+    n_moves = 0
+    while not batch.is_done:
+        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
+        for i, state in enumerate(batch.get_unfinished_states()):
+            state.set_context_tokens(ids, i, nF)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
+        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
+        preacts2f += hidden_b
+        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
+        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
+        statevecs, which = ops.maxout(preacts)
+        # We don't use output's backprop, since we want to backprop for
+        # all states at once, rather than a single state.
+        scores = output.predict(statevecs)
+        scores[:, seen_mask] = ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
+        cpu_scores = ops.to_numpy(scores)
+        if actions is None:
+            batch.advance(cpu_scores)
+        else:
+            batch.advance_with_actions(actions[0])
+            actions = actions[1:]
+        all_scores.append(scores)
+        if is_train:
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
+        if n_moves >= max_moves >= 1:
+            break
+        n_moves += 1
+
+    def backprop_parser(d_states_d_scores):
+        ids = ops.xp.vstack(all_ids)
+        which = ops.xp.vstack(all_which)
+        statevecs = ops.xp.vstack(all_statevecs)
+        _, d_scores = d_states_d_scores
+        if model.attrs.get("unseen_classes"):
+            # If we have a negative gradient (i.e. the probability should
+            # increase) on any classes we filtered out as unseen, mark
+            # them as seen.
+            for clas in set(model.attrs["unseen_classes"]):
+                if (d_scores[:, clas] < 0).any():
+                    model.attrs["unseen_classes"].remove(clas)
+        d_scores *= seen_mask == False
+        # Calculate the gradients for the parameters of the output layer.
+        # The weight gemm is (nS, nO) @ (nS, nH).T
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
+        # This gemm is (nS, nO) @ (nO, nH)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
+        # Backprop through the maxout activation
+        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
+        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        # We don't need to backprop the summation, because we pass back the IDs instead
+        d_state_features = backprop_feats((d_preacts2f, ids))
+        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
+        ops.scatter_add(d_tokvecs, ids, d_state_features)
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
+        return (backprop_tok2vec(d_tokvecs[:-1]), None)
+
+    return (list(batch), all_scores), backprop_parser
+
+
+def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
+    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = True
+    return mask
+
+
+def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
+    W: Floats2d = model.get_param("hidden_W")
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
+    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
+    W3f = W3f.transpose((1, 0, 2))
+    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
+    assert X.shape == (X.shape[0], nI), X.shape
+    Yf_ = model.ops.gemm(X, W2f, trans2=True)
+    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
+
+    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nH, nP), and get back:
+        # (nB, nH, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nH, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        dXf = model.ops.gemm(dY, W)
+        Xf = X[ids].reshape((ids.shape[0], -1))
+        dW = model.ops.gemm(dY, Xf, trans1=True)
+        model.inc_grad("hidden_W", dW)
+        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
+
+    return Yf, backward
+
+
+def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
+    if Y is None:
+        return None
+    _, scores = Y
+    if len(scores) == 0:
+        return None
+    assert scores[0].shape[0] >= 1
+    assert len(scores[0].shape) == 2
+    return scores[0].shape[1]
+
+
+def _lsuv_init(model: Model):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    W = model.maybe_get_param("hidden_W")
+    if W is not None and W.any():
+        return
+
+    nF = model.get_dim("nF")
+    nH = model.get_dim("nH")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nH, nP, nI)
+    b = model.ops.alloc2f(nH, nP)
+    pad = model.ops.alloc4f(1, nF, nH, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc_f((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc_f((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("hidden_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("hidden_b", b)
+        else:
+            break
+    return model
+
+
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
+
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = <const float*>hidden_b.data
+    weights.hidden_weights = <const float *> output_W.data
+    weights.hidden_bias = <const float *> output_b.data
+    weights.seen_mask = <const int8_t*> seen_mask.data
+
+    return weights
+
+
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
+
+
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
+    cdef ActivationsC A
+    memset(&A, 0, sizeof(A))
+    _resize_activations(&A, n)
+    return A
+
+
+cdef void _free_activations(const ActivationsC* A) nogil:
+    free(A.token_ids)
+    free(A.unmaxed)
+    free(A.hiddens)
+    free(A.is_valid)
+
+
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    for i in range(n.states):
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    if W.hidden_weights == NULL:
+        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+                      1.0, <const float *>A.hiddens, n.hiddens,
+                      <const float *>W.hidden_weights, n.hiddens,
+                      0.0, scores, n.classes)
+        # Add bias
+        for i in range(n.states):
+            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
+    # Set unseen classes to minimum value
+    i = 0
+    min_ = scores[0]
+    for i in range(1, n.states * n.classes):
+        if scores[i] < min_:
+            min_ = scores[i]
+    for i in range(n.states):
+        for j in range(n.classes):
+            if W.seen_mask[j]:
+                scores[i*n.classes+j] = min_
+
+
+cdef void _sum_state_features(CBlas cblas, float* output,
+        const float* cached, const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    cdef int B = n.states
+    cdef int O = n.hiddens * n.pieces
+    cdef int F = n.feats
+    cdef int T = n.tokens
+    padding = cached + (T * F * O)
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
+        token_ids += F
+
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
new file mode 100644
index 00000000000..7fee05bad60
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd
@@ -0,0 +1,2 @@
+cdef int arg_max(const float* scores, const int n_classes) nogil
+cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
new file mode 100644
index 00000000000..582756bf5be
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx
@@ -0,0 +1,22 @@
+# cython: infer_types=True
+
+cdef inline int arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index c063cf97cd4..1b6b25e5f16 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -8,6 +8,7 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -27,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -35,31 +36,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -132,19 +136,20 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
             return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
     int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
         else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
             if b_i >= this.length:
                 return -1
             else:
@@ -243,7 +248,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
             return 1
         else:
             return 0
@@ -327,7 +332,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -347,6 +352,9 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -355,12 +363,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = arcs.back()
+        arc = &arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -400,10 +408,11 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 9dda3bd5e44..462aa820e4f 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,6 +779,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -863,6 +865,7 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
new file mode 100644
index 00000000000..60734e549aa
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pxd
@@ -0,0 +1,2 @@
+cdef class Batch:
+    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
new file mode 100644
index 00000000000..91073b52e68
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/batch.pyx
@@ -0,0 +1,52 @@
+from typing import Any
+
+TransitionSystem = Any  # TODO
+
+cdef class Batch:
+    def advance(self, scores):
+        raise NotImplementedError
+
+    def get_states(self):
+        raise NotImplementedError
+
+    @property
+    def is_done(self):
+        raise NotImplementedError
+
+    def get_unfinished_states(self):
+        raise NotImplementedError
+
+    def __getitem__(self, i):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+
+class GreedyBatch(Batch):
+    def __init__(self, moves: TransitionSystem, states, golds):
+        self._moves = moves
+        self._states = states
+        self._next_states = [s for s in states if not s.is_final()]
+
+    def advance(self, scores):
+        self._next_states = self._moves.transition_states(self._next_states, scores)
+
+    def advance_with_actions(self, actions):
+        self._next_states = self._moves.apply_actions(self._next_states, actions)
+
+    def get_states(self):
+        return self._states
+
+    @property
+    def is_done(self):
+        return all(s.is_final() for s in self._states)
+
+    def get_unfinished_states(self):
+        return [st for st in self._states if not st.is_final()]
+
+    def __getitem__(self, i):
+        return self._states[i]
+
+    def __len__(self):
+        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index a4f7094520c..bd4e06dedb3 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -316,6 +316,8 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index f25408a13ba..c2a0d22956a 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,6 +21,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -177,3 +181,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 04cd10d8864..66cc7747b69 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,3 +57,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 4a0feb435dd..7bd39ba43c5 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,6 +3,8 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -74,7 +76,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -86,6 +99,8 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -111,6 +126,7 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
+                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -138,6 +154,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +288,35 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+    int batch_size) nogil:
+        cdef int i
+        cdef Transition action
+        cdef StateC* state
+        for i in range(batch_size):
+            state = states[i]
+            action = moves.c[actions[i]]
+            action.do(state, action.label)
+            state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+    int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)
+
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py
similarity index 97%
rename from spacy/pipeline/dep_parser.pyx
rename to spacy/pipeline/dep_parser.py
index cbd7187ff0f..c996074d2c4 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.py
@@ -5,6 +5,8 @@
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from ._parser_internals.arc_eager cimport ArcEager
 from .transition_parser cimport Parser
@@ -22,12 +24,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -233,6 +234,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -240,8 +242,11 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -254,11 +259,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -278,8 +284,7 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py
similarity index 93%
rename from spacy/pipeline/ner.pyx
rename to spacy/pipeline/ner.py
index fe54d33a17b..41280c49390 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.py
@@ -10,6 +10,13 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
+from ..util import registry
+from ..training import remove_bilu_prefix
 
 from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
@@ -21,12 +28,11 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -51,8 +57,12 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
-
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -119,7 +129,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -193,11 +208,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -215,15 +231,14 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,   # not relevant for NER
+            min_action_freq=1,  # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 7c3a9d56249..d9cbf5e8c72 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,8 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +415,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +542,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,7 +578,6 @@ def test_overfitting_IO(use_upper):
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
-        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index f63d56f6922..9eccb056ce2 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,16 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.pipeline import DependencyParser
-from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
-from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
@@ -59,6 +60,8 @@
     ),
 ]
 
+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1
 
 
@@ -171,6 +174,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
+def test_parser_apply_actions(en_vocab, en_parser):
+    words = ["I", "ate", "pizza"]
+    words2 = ["Eat", "more", "pizza", "!"]
+    doc1 = Doc(en_vocab, words=words)
+    doc2 = Doc(en_vocab, words=words2)
+    docs = [doc1, doc2]
+
+    moves = en_parser.moves
+    moves.add_action(0, "")
+    moves.add_action(1, "")
+    moves.add_action(2, "nsubj")
+    moves.add_action(3, "obj")
+    moves.add_action(2, "amod")
+
+    actions = [
+        numpy.array([0, 0], dtype="i"),
+        numpy.array([2, 0], dtype="i"),
+        numpy.array([0, 4], dtype="i"),
+        numpy.array([3, 3], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([1, 1], dtype="i"),
+        numpy.array([0], dtype="i"),
+        numpy.array([1], dtype="i"),
+    ]
+
+    states = moves.init_batch(docs)
+    active_states = states
+
+    for step_actions in actions:
+        active_states = moves.apply_actions(active_states, step_actions)
+
+    assert len(active_states) == 0
+
+    for (state, doc) in zip(states, docs):
+        moves.set_annotations(state, doc)
+
+    assert docs[0][0].head.i == 1
+    assert docs[0][0].dep_ == "nsubj"
+    assert docs[0][1].head.i == 1
+    assert docs[0][1].dep_ == "ROOT"
+    assert docs[0][2].head.i == 1
+    assert docs[0][2].dep_ == "obj"
+
+    assert docs[1][0].head.i == 0
+    assert docs[1][0].dep_ == "ROOT"
+    assert docs[1][1].head.i == 2
+    assert docs[1][1].dep_ == "amod"
+    assert docs[1][2].head.i == 0
+    assert docs[1][2].dep_ == "obj"
+
+
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -319,7 +373,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+@pytest.mark.parametrize("pipe_name", PARSERS)
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -345,11 +399,15 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
-def test_overfitting_IO(pipe_name):
+@pytest.mark.parametrize(
+    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
+)
+def test_overfitting_IO(pipe_name, max_moves):
+    fix_random_seed(0)
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
+    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -455,10 +513,12 @@ def test_distill(max_moves):
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index c3c4bb6c686..f6cefbc1f84 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v2"
+    @architectures = "spacy.TransitionBasedParser.v3"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 8a1c74ca9ed..b351ea80121 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,33 +189,11 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -246,7 +224,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser
 
@@ -360,15 +337,16 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        assert model.get_ref("upper").get_dim("nI") == 65
-        assert model.get_ref("lower").get_dim("nI") == 65
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -381,11 +359,13 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        model.get_ref("tok2vec")
-        # check that we have the correct settings, not the default ones
-        if model.attrs["has_upper"]:
-            assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+        assert model.get_ref("tok2vec") is not None
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
@@ -581,9 +561,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index d2a41ff0fed..160e2ecf335 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,24 +1,13 @@
 import ctypes
 import os
 from pathlib import Path
-
-import pytest
-
-try:
-    from pydantic.v1 import ValidationError
-except ImportError:
-    from pydantic import ValidationError  # type: ignore
-
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -101,34 +90,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 956234ac0d4..db8f974ea19 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,18 +833,17 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -874,23 +873,22 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                       |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
-</Accordion>
+ </Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index b44df538766..44c80622437 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}

From e4bb27156bbb67d17a0438ee01efb2be68bba952 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 18 Jan 2023 18:28:30 +0100
Subject: [PATCH 485/504] Fix batching regression (#12094)

* Fix batching regression

Some time ago, the spaCy v4 branch switched to the new Thinc v9
schedule. However, this introduced an error in how batching is handed.

In the PR, the batchers were changed to keep track of their step,
so that the step can be passed to the schedule. However, the issue
is that the training loop repeatedly calls the batching functions
(rather than using an infinite generator/iterator). So, the step and
therefore the schedule would be reset each epoch. Before the schedule
switch we didn't have this issue, because the old schedules were
stateful.

This PR fixes this issue by reverting the batching functions to use
a (stateful) generator. Their registry functions do accept a `Schedule`
and we convert `Schedule`s to generators.

* Update batcher docs

* Docstring fixes

* Make minibatch take iterables again as well

* Bump thinc requirement to 9.0.0.dev2

* Use type declaration

* Convert another comment into a proper type declaration
---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 spacy/util.py    | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 58e4382e829..3891d137867 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 5c889c91f81..3a1ef6b70b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/spacy/util.py b/spacy/util.py
index a76e8f73eeb..20d7cbb5726 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1622,12 +1622,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
         size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

From 874c3efad87f606936b22efacf126140eedca1ce Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 27 Jan 2023 15:48:20 +0100
Subject: [PATCH 486/504] Drop python 3.6/3.7, remove unneeded compat (#12187)

* Drop python 3.6/3.7, remove unneeded compat

* Remove unused import

* Minimal python 3.8+ docs updates
---
 spacy/cli/_util.py      | 10 ++++++++++
 spacy/cli/debug_data.py |  8 ++++++++
 spacy/schemas.py        |  9 +++++++++
 spacy/util.py           |  2 +-
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index eed61119070..977912443bd 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,3 +1,10 @@
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
+from typing import TYPE_CHECKING, overload
+import sys
+import shutil
+from pathlib import Path
+from wasabi import msg, Printer
+import srsly
 import hashlib
 import os
 import shutil
@@ -27,6 +34,9 @@
 from typer.main import get_command
 from wasabi import Printer, msg
 
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 7a98e6d563c..60f760ccb52 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,3 +1,11 @@
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import Literal, cast, overload
+from pathlib import Path
+from collections import Counter
+import sys
+import srsly
+from wasabi import Printer, MESSAGES, msg
+import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 6b41bb5b2b7..cf9d3064065 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,3 +1,12 @@
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
+from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
+from enum import Enum
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError, Model
+from thinc.config import Promise
+from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/util.py b/spacy/util.py
index 20d7cbb5726..ae1135234e3 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -65,7 +65,7 @@
 
 
 from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows, importlib_metadata
+from .compat import cupy, CudaStream, is_windows
 from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows

From aa9d3b8d8838f9b6030c48457d9ad77442c341e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 30 Jan 2023 12:44:11 +0100
Subject: [PATCH 487/504] Add `Language.distill` (#12116)

* Add `Language.distill`

This method is the distillation counterpart of `Language.update`.  It
takes a teacher `Language` instance and distills the student pipes on
the teacher pipes.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Clarify that how Example is used in distillation

* Update transition parser distill docstring for examples argument

* Pass optimizer to `TrainablePipe.distill`

* Annotate pipe before update

As discussed internally, we want to let a pipe annotate before doing an
update with gold/silver data. Otherwise, the output may be (too)
informed by the gold/silver data.

* Rename `component_map` to `student_to_teacher`

* Better synopsis in `Language.distill` docstring

* `name` -> `student_name`

* Fix labels type in docstring

* Mark distill test as slow

* Fix `student_to_teacher` type in docs

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/tests/test_language.py  |  6 ++++++
 website/docs/api/language.mdx | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 941edf0fedc..c6d2508da72 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -91,6 +91,12 @@
 ]
 
 
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
 def evil_component(doc):
     if "2" in doc.text:
         raise ValueError("no dice")
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 76743d84f9d..249a65d0d5c 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -417,6 +417,34 @@ Distill the models in a student pipeline from a teacher pipeline.
 | `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
 | **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
 
+## Language.distill {id="distill",tag="method,experimental",version="4"}
+
+Distill the models in a student pipeline from a teacher pipeline.
+
+> #### Example
+>
+> ```python
+>
+> teacher = spacy.load("en_core_web_lg")
+> student = English()
+> student.add_pipe("tagger")
+> student.distill(teacher, examples, sgd=optimizer)
+> ```
+
+| Name                 | Description                                                                                                                                                                                 |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher`            | The teacher pipeline to distill from. ~~Language~~                                                                                                                                          |
+| `examples`           | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
+| _keyword-only_       |                                                                                                                                                                                             |
+| `drop`               | The dropout rate. ~~float~~                                                                                                                                                                 |
+| `sgd`                | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                               |
+| `losses`             | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                                                             |
+| `component_cfg`      | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                              |
+| `exclude`            | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~                                                                                                          |
+| `annotates`          | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~                                                                |
+| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                              |
+| **RETURNS**          | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                       |
+
 ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the

From 7cb255ca625b28f3ab636626c60067b53a94d2bb Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 31 Jan 2023 19:31:17 +0900
Subject: [PATCH 488/504] Don't re-download installed models (#12188)

* Don't re-download installed models

When downloading a model, this checks if the same version of the same
model is already installed. If it is then the download is skipped.

This is necessary because pip uses the final download URL for its
caching feature, but because of the way models are hosted on Github,
their URLs change every few minutes.

* Use importlib instead of meta.json

* Use get_package_version

* Add untested, disabled test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/download.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0b8ed54ed3c..ea7a06c5417 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -8,7 +8,8 @@
 
 from .. import about
 from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
+from ..util import is_prerelease_version, get_installed_models
+from ..util import get_package_version
 
 
 @app.command(

From 832525f7a70c256767886ee91042c20b26f6a816 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:30:43 +0100
Subject: [PATCH 489/504] Rename language codes (Icelandic, multi-language)
 (#12149)

* Init

* fix tests

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Fix test_blank_languages

* Rename xx to mul in docs

* Format _util with black

* prettier formatting

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/_util.py   | 1 +
 spacy/cli/convert.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 977912443bd..644f3e5ef24 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -37,6 +37,7 @@
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a282e59c749..19591a05c94 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,6 +8,8 @@
 import srsly
 from wasabi import Printer
 
+from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (

From 582c915e8d176395d0b1f6502084045807884f4f Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 1 Feb 2023 17:47:56 +0900
Subject: [PATCH 490/504] Move Entity Linker v1 to spacy-legacy (#12006)

* Move Entity Linker v1 component to spacy-legacy

This is a follow up to #11889 that moves the component instead of
removing it.

In general, we never import from spacy-legacy in spaCy proper. However,
to use this component, that kind of import will be necessary. I was able
to test this without issues, but is this current import strategy
acceptable? Or should we put the component in a registry?

* Use spacy-legacy pr for CI

This will need to be reverted before merging.

* Add temporary step to log installed spacy-legacy version

* Modify requirements.txt to trigger tests

* Add comment to Python to trigger tests

* TODO REVERT This is a commit with logic changes to trigger tests

* Remove pipe from YAML

Works locally, but possibly this is causing a quoting error or
something.

* Revert "TODO REVERT This is a commit with logic changes to trigger tests"

This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027.

* Revert "Add comment to Python to trigger tests"

This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583.

* Add more logging

* Try installing directly in workflow

* Try explicitly uninstalling spacy-legacy first

* Cat requirements.txt to confirm contents

In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in
the logs, it's clear that a development release of 9.0 is being
installed. It's not clear why that would happen.

* Log requirements at start of build

* TODO REVERT Change thinc spec

Want to see what happens to the installed thinc spec with this change.

* Update thinc requirements

This makes it the same as it was before the merge, >=8.1.0,<8.2.0.

* Use same thinc version as v4 branch

* TODO REVERT Mark dependency check as xfail

spacy-legacy is specified as a git checkout in requirements.txt while
this PR is in progress, which makes the consistency check here fail.

* Remove debugging output / install step

* Revert "Remove debugging output / install step"

This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07.

* Clean up debugging output

The manual install step with the URL fragment seems to have caused
issues on Windows due to the = in the URL being misinterpreted. On the
other hand, removing it seems to mean the git version of spacy-legacy
isn't actually installed.

This PR removes the URL fragment but keeps the direct command-line
install. Additionally, since it looks like this job is configured to use
the default shell (and not bash), it removes a comment that upsets the
Windows cmd shell.

* Revert "TODO REVERT Mark dependency check as xfail"

This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e.

* Fix requirements.txt, increasing spacy-legacy version

* Raise spacy legacy version in setup.cfg

* Remove azure build workarounds

* make spacy-legacy version explicit in error message

* Remove debugging line

* Suggestions from code review
---
 spacy/tests/pipeline/test_entity_linker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 32e7a265f37..33e8d47400e 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1090,6 +1090,8 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
+    from spacy_legacy.components.entity_linker import EntityLinker_v1
+
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From 8a4ead5eec36c71920591749780aae9bcaf8e2af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 21 Feb 2023 15:47:18 +0100
Subject: [PATCH 491/504] Reimplement distillation with oracle cut size
 (#12214)

* Improve the correctness of _parse_patch

* If there are no more actions, do not attempt to make further
  transitions, even if not all states are final.
* Assert that the number of actions for a step is the same as
  the number of states.

* Reimplement distillation with oracle cut size

The code for distillation with an oracle cut size was not reimplemented
after the parser refactor. We did not notice, because we did not have
tests for this functionality. This change brings back the functionality
and adds this to the parser tests.

* Rename states2actions to _states_to_actions for consistency

* Test distillation max cuts in NER

* Mark parser/NER tests as slow

* Typo

* Fix invariant in _states_diff_to_actions

* Rename _init_batch -> _init_batch_from_teacher

* Ninja edit the ninja edit

* Check that we raise an exception when we pass the incorrect number or actions

* Remove unnecessary get

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Write out condition more explicitly

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/ml/tb_framework.pyx        |  4 ++-
 spacy/tests/parser/test_model.py | 61 ++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00bd..9b2114900d3 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 00000000000..8c1cf7a9346
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )

From c2c520870dd110a780fbef2a38fe08790170f2bb Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Tue, 7 Mar 2023 13:10:45 +0100
Subject: [PATCH 492/504] Drop support for EntityLinker_v1. (#12377)

---
 spacy/tests/pipeline/test_entity_linker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 33e8d47400e..9d533a69977 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1090,7 +1090,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
 
     # Ensure that the legacy architectures still work
     vector_length = 3

From 0095ae4d4ac32bf5d34bc1353cf454f25cc6c887 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 493/504] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/tests/pipeline/test_entity_linker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 9d533a69977..32e7a265f37 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1090,7 +1090,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()

From 5a53e5fbedbc5cb80b555a69de6ca45dd64247bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 21 Apr 2023 13:49:40 +0200
Subject: [PATCH 494/504] Add distillation loop (#12542)

* Add distillation initialization and loop

* Fix up configuration keys

* Add docstring

* Type annotations

* init_nlp_distill -> init_nlp_student

* Do not resolve dot name distill corpus in initialization

(Since we don't use it.)

* student: do not request use of optimizer in student pipe

We apply finish up the updates once in the training loop instead.

Also add the necessary logic to `Language.distill` to mirror
`Language.update`.

* Correctly determine sort key in subdivide_batch

* Fix _distill_loop docstring wrt. stopping condition

* _distill_loop: fix distill_data docstring

Make similar changes in train_while_improving, since it also had
incorrect types and missing type annotations.

* Move `set_{gpu_allocator,seed}_from_config` to spacy.util

* Update Language.update docs for the sgd argument

* Type annotation

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 spacy/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/util.py b/spacy/util.py
index ae1135234e3..624fffe865d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,7 @@
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 from thinc.api import ConfigValidationError, Model, constant as constant_schedule
+from thinc.api import fix_random_seed, set_gpu_allocator
 import functools
 import itertools
 import logging

From c9e7311f1575857a3982d53596c3b4986147bc5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 26 Jun 2023 11:41:03 +0200
Subject: [PATCH 495/504] isort all the things

---
 spacy/cli/_util.py                          | 11 ------
 spacy/cli/convert.py                        |  2 --
 spacy/cli/debug_data.py                     |  8 -----
 spacy/cli/download.py                       | 12 +++++--
 spacy/ml/tb_framework.pyx                   | 37 +++++++++++++++------
 spacy/pipeline/_parser_internals/_state.pxd |  2 --
 spacy/pipeline/dep_parser.py                |  7 ----
 spacy/pipeline/ner.py                       | 14 --------
 spacy/pipeline/span_ruler.py                |  8 -----
 spacy/pipeline/textcat.py                   |  4 ---
 spacy/schemas.py                            |  9 -----
 spacy/strings.pxd                           |  3 --
 spacy/tests/doc/test_underscore.py          |  1 +
 spacy/tests/parser/test_ner.py              |  2 --
 spacy/tests/parser/test_parse.py            | 10 +++---
 spacy/tests/pipeline/test_entity_ruler.py   |  6 ----
 spacy/tests/test_misc.py                    | 20 +++++++----
 spacy/tokenizer.pxd                         |  4 ---
 spacy/tokens/__init__.py                    |  3 +-
 spacy/tokens/morphanalysis.pyx              |  4 ---
 spacy/tokens/span.pxd                       |  1 -
 spacy/tokens/token.pyx                      |  1 +
 spacy/training/__init__.py                  |  3 --
 spacy/util.py                               | 11 +-----
 24 files changed, 59 insertions(+), 124 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 644f3e5ef24..eed61119070 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,10 +1,3 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
 import os
 import shutil
@@ -34,10 +27,6 @@
 from typer.main import get_command
 from wasabi import Printer, msg
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import ProjectConfigSchema, validate
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 19591a05c94..a282e59c749 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -8,8 +8,6 @@
 import srsly
 from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training import docs_to_json
 from ..training.converters import (
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 60f760ccb52..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,11 +1,3 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
 import sys
 from collections import Counter
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index ea7a06c5417..0635522930b 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -7,9 +7,15 @@
 from wasabi import msg
 
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 9b2114900d3..fd0af12ceab 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,28 +1,45 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import List, Tuple, Any, Optional, TypeVar, cast
-from libc.string cimport memset, memcpy
+from typing import Any, List, Optional, Tuple, TypeVar, cast
+
 from libc.stdlib cimport calloc, free, realloc
+from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
+
 import numpy
+
 cimport numpy as np
-from thinc.api import Model, normal_init, chain, list2array, Linear
-from thinc.api import uniform_init, glorot_uniform_init, zero_init
-from thinc.api import NumpyOps
+
+from thinc.api import (
+    Linear,
+    Model,
+    NumpyOps,
+    chain,
+    glorot_uniform_init,
+    list2array,
+    normal_init,
+    uniform_init,
+    zero_init,
+)
+
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
-from thinc.types import Ints1d, Ints2d
+
+from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
 from ..pipeline._parser_internals.batch import GreedyBatch
+
 from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
-from ..pipeline._parser_internals.transition_system cimport TransitionSystem
 from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
+from ..pipeline._parser_internals.transition_system cimport (
+    TransitionSystem,
+    c_apply_actions,
+    c_transition_batch,
+)
+
 from ..tokens.doc import Doc
 from ..util import registry
 
-
 State = Any  # TODO
 
 
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1b6b25e5f16..1c61ac271d8 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,5 +1,4 @@
 cimport libcpp
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -8,7 +7,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py
index c996074d2c4..b4961487b83 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.py
@@ -4,13 +4,6 @@
 
 from thinc.api import Config, Model
 
-from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
-from ._parser_internals.arc_eager import ArcEager
-
-from ._parser_internals.arc_eager cimport ArcEager
-from .transition_parser cimport Parser
-
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py
index 41280c49390..1c7cf151385 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.py
@@ -11,20 +11,6 @@
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser import Parser
-from ._parser_internals.ner import BiluoPushDown
-from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
-from ..util import registry
-from ..training import remove_bilu_prefix
-
-from ._parser_internals.ner cimport BiluoPushDown
-from .transition_parser cimport Parser
-
-from ..language import Language
-from ..scorer import get_ner_prf
-from ..training import remove_bilu_prefix
-from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 3f876598013..cd8fea36b47 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -17,14 +17,6 @@
 
 import srsly
 
-from .pipe import Pipe
-from ..training import Example
-from ..language import Language
-from ..errors import Errors, Warnings
-from ..util import ensure_path, SimpleFrozenList, registry
-from ..tokens import Doc, Span
-from ..scorer import Scorer, get_ner_prf
-from ..matcher import Matcher, PhraseMatcher
 from .. import util
 from ..errors import Errors, Warnings
 from ..language import Language
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 79a98b9bc5f..13841dd7bbb 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,7 +1,3 @@
-from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
-from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
-from thinc.types import Floats2d
-import numpy
 from itertools import islice
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
diff --git a/spacy/schemas.py b/spacy/schemas.py
index cf9d3064065..6b41bb5b2b7 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,12 +1,3 @@
-from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
-from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
-from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
-from pydantic.main import ModelMetaclass
-from thinc.api import Optimizer, ConfigValidationError, Model
-from thinc.config import Promise
-from collections import defaultdict
 import inspect
 import re
 from collections import defaultdict
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 688dbc46261..c05731c9a15 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,6 +1,3 @@
-from libc.stdint cimport int64_t, uint32_t
-from libcpp.vector cimport vector
-from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t, uint32_t
 from libcpp.set cimport set
diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py
index 37094ac3e06..afac08a18b6 100644
--- a/spacy/tests/doc/test_underscore.py
+++ b/spacy/tests/doc/test_underscore.py
@@ -4,6 +4,7 @@
 from spacy.tokens import Doc, Span, Token
 from spacy.tokens.underscore import Underscore
 
+
 # Helper functions
 def _get_tuple(s: Span):
     return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index d9cbf5e8c72..f0efc3a63fd 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,8 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-from thinc.api import fix_random_seed
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 9eccb056ce2..fe82ad2fde0 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,17 +1,19 @@
 import itertools
-import pytest
+
 import numpy
+import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
 
 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.training import Example
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
+from spacy.training import Example
 from spacy.vocab import Vocab
-from spacy import util, registry
-from thinc.api import fix_random_seed
 
 from ..util import apply_transition_sequence, make_tempdir
 
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 74731140688..12f2c9def2d 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -2,12 +2,6 @@
 from thinc.api import NumpyOps, get_current_ops
 
 from spacy import registry
-from spacy.tokens import Doc, Span
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.pipeline import EntityRecognizer, merge_entities
-from spacy.pipeline import SpanRuler
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
 from spacy.lang.en import English
 from spacy.language import Language
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 160e2ecf335..c05ef625e11 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,13 +1,19 @@
 import ctypes
 import os
 from pathlib import Path
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
-from spacy.util import find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+
+import pytest
+from pydantic import ValidationError
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 2610532b75d..b2e50969462 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -2,10 +2,6 @@ from cymem.cymem cimport Pool
 from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 
-from .typedefs cimport hash_t
-from .structs cimport LexemeC, SpanC, TokenC
-from .tokens.doc cimport Doc
-from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher
 from .structs cimport LexemeC, SpanC, TokenC
 from .tokens.doc cimport Doc
diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index 16c43485340..7617e462fde 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -4,7 +4,6 @@
 from .morphanalysis import MorphAnalysis
 from .span import Span
 from .span_group import SpanGroup
-from .doc_bin import DocBin
-from .morphanalysis import MorphAnalysis
+from .token import Token
 
 __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index b4f7ffbb0d9..f4d5901e590 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -9,10 +9,6 @@ from libcpp.memory cimport shared_ptr
 from ..morphology cimport MorphAnalysisC, check_feature, get_by_field, list_features
 from ..typedefs cimport attr_t, hash_t
 from ..vocab cimport Vocab
-from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
-from libcpp.memory cimport shared_ptr
-from cython.operator cimport dereference as deref
 
 
 cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 68f722a13cb..fb592e68bd8 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -1,4 +1,3 @@
-from libcpp.memory cimport shared_ptr
 cimport numpy as np
 from libcpp.memory cimport shared_ptr
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index f6c6ad8b9a1..017f239147f 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -40,6 +40,7 @@ from .. import parts_of_speech
 from ..attrs import IOB_STRINGS
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
+
 from cython.operator cimport dereference as deref
 
 from cython.operator cimport dereference as deref
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 9445d0b63a5..adfc2bb6658 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,6 +1,3 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
-from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
diff --git a/spacy/util.py b/spacy/util.py
index 624fffe865d..ae9837e3afe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,13 +2,7 @@
 import importlib
 import importlib.metadata
 import importlib.util
-import re
-from pathlib import Path
-import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
-from thinc.api import ConfigValidationError, Model, constant as constant_schedule
-from thinc.api import fix_random_seed, set_gpu_allocator
-import functools
+import inspect
 import itertools
 import logging
 import os
@@ -65,9 +59,6 @@
     cupy = None
 
 
-from .symbols import ORTH
-from .compat import cupy, CudaStream, is_windows
-from .errors import Errors, Warnings
 from . import about
 from .compat import CudaStream, cupy, is_windows
 from .errors import Errors, Warnings

From 25cd37b1818863559162d9e5de6d5d5d3f487170 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Wed, 19 Jul 2023 17:41:29 +0200
Subject: [PATCH 496/504] cython fixes and cleanup

---
 spacy/ml/tb_framework.pyx                     | 55 ++++++++++---------
 .../_parser_internals/transition_system.pxd   |  4 +-
 .../_parser_internals/transition_system.pyx   | 21 ++++---
 spacy/tokens/span.pyx                         |  1 -
 4 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index fd0af12ceab..ed04045a6a5 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, TypeVar, cast
+from typing import Any, List, Optional, Tuple, cast
 
 from libc.stdlib cimport calloc, free, realloc
 from libc.string cimport memcpy, memset
@@ -23,7 +23,7 @@ from thinc.api import (
 
 from thinc.backends.cblas cimport CBlas, saxpy, sgemm
 
-from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
+from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
 
 from ..errors import Errors
 from ..pipeline._parser_internals import _beam_utils
@@ -136,7 +136,7 @@ def init(
     Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
 ):
     if X is not None:
-        docs, moves = X
+        docs, _ = X
         model.get_ref("tok2vec").initialize(X=docs)
     else:
         model.get_ref("tok2vec").initialize()
@@ -145,7 +145,7 @@ def init(
         current_nO = model.maybe_get_dim("nO")
         if current_nO is None or current_nO != inferred_nO:
             model.attrs["resize_output"](model, inferred_nO)
-    nO = model.get_dim("nO")
+    # nO = model.get_dim("nO")
     nP = model.get_dim("nP")
     nH = model.get_dim("nH")
     nI = model.get_dim("nI")
@@ -192,9 +192,10 @@ class TransitionModelInputs:
         self,
         docs: List[Doc],
         moves: TransitionSystem,
-        actions: Optional[List[Ints1d]]=None,
-        max_moves: int=0,
-        states: Optional[List[State]]=None):
+        actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0,
+        states: Optional[List[State]] = None,
+    ):
         """
         actions (Optional[List[Ints1d]]): actions to apply for each Doc.
         docs (List[Doc]): Docs to predict transition sequences for.
@@ -234,12 +235,12 @@ def forward(model, inputs: TransitionModelInputs, is_train: bool):
         return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
     else:
         return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-            feats, backprop_feats, seen_mask, is_train, actions=actions,
-            max_moves=inputs.max_moves)
+                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
+                                 max_moves=inputs.max_moves)
 
 
 def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
+                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
     cdef vector[StateC*] c_states
     cdef StateClass state
     for state in states:
@@ -257,9 +258,10 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
 
     return (states, scores), backprop
 
+
 cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i, j
+    cdef int i
     cdef vector[StateC *] unfinished
     cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
@@ -276,7 +278,7 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                    sizes.states)
+                                   sizes.states)
             else:
                 c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
             for i in range(sizes.states):
@@ -302,8 +304,8 @@ def _forward_fallback(
     backprop_feats,
     seen_mask,
     is_train: bool,
-    actions: Optional[List[Ints1d]]=None,
-    max_moves: int=0):
+    actions: Optional[List[Ints1d]] = None,
+        max_moves: int = 0):
     nF = model.get_dim("nF")
     output = model.get_ref("output")
     hidden_b = model.get_param("hidden_b")
@@ -371,7 +373,7 @@ def _forward_fallback(
             for clas in set(model.attrs["unseen_classes"]):
                 if (d_scores[:, clas] < 0).any():
                     model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
+        d_scores *= seen_mask == False  # no-cython-lint
         # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
         output.inc_grad("b", d_scores.sum(axis=0))
@@ -571,13 +573,13 @@ cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
@@ -599,9 +601,9 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
     else:
         # Compute hidden-to-output
         sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                      1.0, <const float *>A.hiddens, n.hiddens,
-                      <const float *>W.hidden_weights, n.hiddens,
-                      0.0, scores, n.classes)
+                     1.0, <const float *>A.hiddens, n.hiddens,
+                     <const float *>W.hidden_weights, n.hiddens,
+                     0.0, scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
@@ -617,12 +619,12 @@ cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC**
                 scores[i*n.classes+j] = min_
 
 
-cdef void _sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f, i
+cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
+                              const int* token_ids, SizesC n) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces
+    cdef int O = n.hiddens * n.pieces  # no-cython-lint
     cdef int F = n.feats
     cdef int T = n.tokens
     padding = cached + (T * F * O)
@@ -637,4 +639,3 @@ cdef void _sum_state_features(CBlas cblas, float* output,
                 feature = &cached[idx]
             saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
         token_ids += F
-
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 66cc7747b69..08baed932ba 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -60,7 +60,7 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil
+                          int batch_size) nogil
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-        int nr_class, int batch_size) nogil
+                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 7bd39ba43c5..ae1cf890f3e 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -291,19 +291,19 @@ cdef class TransitionSystem:
 
 
 cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-    int batch_size) nogil:
-        cdef int i
-        cdef Transition action
-        cdef StateC* state
-        for i in range(batch_size):
-            state = states[i]
-            action = moves.c[actions[i]]
-            action.do(state, action.label)
-            state.history.push_back(action.clas)
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
 
 
 cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-    int nr_class, int batch_size) nogil:
+                             int nr_class, int batch_size) nogil:
     is_valid = <int*>calloc(moves.n_moves, sizeof(int))
     cdef int i, guess
     cdef Transition action
@@ -319,4 +319,3 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa
             action.do(states[i], action.label)
             states[i].history.push_back(guess)
     free(is_valid)
-
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 65830ea0f5c..87bff6ac61c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -948,7 +948,6 @@ cdef class Span:
         def __set__(self, str ent_id_):
             self.id_ = ent_id_
 
-
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
     # better candidates

From 3f7185cb03e4bf67674d4a4fd8fd4533e70ac10b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 14:38:05 +0100
Subject: [PATCH 497/504] Revert "Reimplement distillation with oracle cut size
 (#12214)"

This reverts commit e27c60a70263f7ab17968964de37e938653e37a2.
---
 spacy/ml/tb_framework.pyx        |  4 +--
 spacy/tests/parser/test_model.py | 61 --------------------------------
 spacy/tests/parser/test_ner.py   |  5 +--
 3 files changed, 2 insertions(+), 68 deletions(-)
 delete mode 100644 spacy/tests/parser/test_model.py

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index ed04045a6a5..b81553323e4 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -268,11 +268,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1 and (actions is None or len(actions) > 0):
+    while sizes.states >= 1:
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
-        assert step_actions is None or step_actions.size == sizes.states, \
-            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
             if actions is None:
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
deleted file mode 100644
index 8c1cf7a9346..00000000000
--- a/spacy/tests/parser/test_model.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-import pytest
-
-from spacy.lang.en import English
-from spacy.ml.tb_framework import TransitionModelInputs
-from spacy.training import Example
-
-TRAIN_DATA = [
-    (
-        "They trade mortgage-backed securities.",
-        {
-            "heads": [1, 1, 4, 4, 5, 1, 1],
-            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
-        },
-    ),
-    (
-        "I like London and Berlin.",
-        {
-            "heads": [1, 1, 1, 2, 2, 1],
-            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
-        },
-    ),
-]
-
-
-@pytest.fixture
-def nlp_parser():
-    nlp = English()
-    parser = nlp.add_pipe("parser")
-
-    train_examples = []
-    for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for dep in annotations["deps"]:
-            parser.add_label(dep)
-    nlp.initialize()
-
-    return nlp, parser
-
-
-def test_incorrect_number_of_actions(nlp_parser):
-    nlp, parser = nlp_parser
-    doc = nlp.make_doc("test")
-
-    # Too many actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
-            )
-        )
-
-    # Too few actions for the number of docs
-    with pytest.raises(AssertionError):
-        parser.model.predict(
-            TransitionModelInputs(
-                docs=[doc, doc],
-                moves=parser.moves,
-                actions=[numpy.array([0], dtype="i")],
-            )
-        )
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index f0efc3a63fd..bb9b7653ce3 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -623,9 +623,7 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-@pytest.mark.slow
-@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
-def test_distill(max_moves):
+def test_distill():
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -643,7 +641,6 @@ def test_distill(max_moves):
 
     student = English()
     student_ner = student.add_pipe("ner")
-    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )

From 2d314415a2614eda91b06885544fccbbe58c9dad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:23:08 +0100
Subject: [PATCH 498/504] Revert "Merge the parser refactor into `v4` (#10940)"

This reverts commit a183db3cefe0713e828868b43daf61c464cae05c.
---
 setup.py                                      |   6 +-
 spacy/cli/templates/quickstart_training.jinja |  12 +-
 spacy/compat.py                               |   5 +
 spacy/ml/_precomputable_affine.py             | 164 +++++
 spacy/ml/tb_framework.pxd                     |  28 -
 spacy/ml/tb_framework.py                      |  50 ++
 spacy/ml/tb_framework.pyx                     | 639 ------------------
 .../_parser_internals/_parser_utils.pxd       |   2 -
 .../_parser_internals/_parser_utils.pyx       |  22 -
 spacy/pipeline/_parser_internals/_state.pxd   |  60 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |   3 -
 spacy/pipeline/_parser_internals/batch.pxd    |   2 -
 spacy/pipeline/_parser_internals/batch.pyx    |  52 --
 spacy/pipeline/_parser_internals/ner.pyx      |   6 +-
 .../pipeline/_parser_internals/stateclass.pyx |   7 -
 .../_parser_internals/transition_system.pxd   |   7 -
 .../_parser_internals/transition_system.pyx   |  69 --
 .../{dep_parser.py => dep_parser.pyx}         |  20 +-
 spacy/pipeline/{ner.py => ner.pyx}            |  35 +-
 spacy/tests/parser/test_ner.py                |  10 +-
 spacy/tests/parser/test_parse.py              |  74 +-
 spacy/tests/pipeline/test_tok2vec.py          |   2 +-
 .../tests/serialize/test_serialize_config.py  |  56 +-
 spacy/tests/test_misc.py                      |  55 +-
 website/docs/api/architectures.mdx            |  26 +-
 website/docs/api/legacy.mdx                   |   2 +-
 26 files changed, 395 insertions(+), 1019 deletions(-)
 create mode 100644 spacy/ml/_precomputable_affine.py
 delete mode 100644 spacy/ml/tb_framework.pxd
 create mode 100644 spacy/ml/tb_framework.py
 delete mode 100644 spacy/ml/tb_framework.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/_parser_utils.pyx
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pxd
 delete mode 100644 spacy/pipeline/_parser_internals/batch.pyx
 rename spacy/pipeline/{dep_parser.py => dep_parser.pyx} (96%)
 rename spacy/pipeline/{ner.py => ner.pyx} (93%)

diff --git a/setup.py b/setup.py
index a4a87d68aec..0eb529c2098 100755
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,12 @@
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.tb_framework",
+    "spacy.ml.parser_model",
     "spacy.morphology",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -43,7 +45,6 @@
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
-    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -51,7 +52,6 @@
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
-    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 36325711d4d..2817147f3e9 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -90,11 +90,12 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -110,11 +111,12 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -385,11 +387,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
+use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -402,11 +405,12 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
diff --git a/spacy/compat.py b/spacy/compat.py
index 1e63807a0e8..30459e2e495 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,6 +23,11 @@
 except ImportError:
     cupy = None
 
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
new file mode 100644
index 00000000000..1c20c622b2c
--- /dev/null
+++ b/spacy/ml/_precomputable_affine.py
@@ -0,0 +1,164 @@
+from thinc.api import Model, normal_init
+
+from ..util import registry
+
+
+@registry.layers("spacy.PrecomputableAffine.v1")
+def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
+    model = Model(
+        "precomputable_affine",
+        forward,
+        init=init,
+        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
+        params={"W": None, "b": None, "pad": None},
+        attrs={"dropout_rate": dropout},
+    )
+    return model
+
+
+def forward(model, X, is_train):
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.get_param("W")
+    # Preallocate array for layer output, including padding.
+    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
+    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
+    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
+
+    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
+    # change its shape to (nF, nO, nP) without breaking existing models. So
+    # we'll squeeze the first dimension here.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
+
+    def backward(dY_ids):
+        # This backprop is particularly tricky, because we get back a different
+        # thing from what we put out. We put out an array of shape:
+        # (nB, nF, nO, nP), and get back:
+        # (nB, nO, nP) and ids (nB, nF)
+        # The ids tell us the values of nF, so we would have:
+        #
+        # dYf = zeros((nB, nF, nO, nP))
+        # for b in range(nB):
+        #     for f in range(nF):
+        #         dYf[b, ids[b, f]] += dY[b]
+        #
+        # However, we avoid building that array for efficiency -- and just pass
+        # in the indices.
+        dY, ids = dY_ids
+        assert dY.ndim == 3
+        assert dY.shape[1] == nO, dY.shape
+        assert dY.shape[2] == nP, dY.shape
+        # nB = dY.shape[0]
+        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
+        Xf = X[ids]
+        Xf = Xf.reshape((Xf.shape[0], nF * nI))
+
+        model.inc_grad("b", dY.sum(axis=0))
+        dY = dY.reshape((dY.shape[0], nO * nP))
+
+        Wopfi = W.transpose((1, 2, 0, 3))
+        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
+        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
+
+        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
+        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
+        # (o, p, f, i) --> (f, o, p, i)
+        dWopfi = dWopfi.transpose((2, 0, 1, 3))
+        model.inc_grad("W", dWopfi)
+        return dXf.reshape((dXf.shape[0], nF, nI))
+
+    return Yf, backward
+
+
+def _backprop_precomputable_affine_padding(model, dY, ids):
+    nB = dY.shape[0]
+    nF = model.get_dim("nF")
+    nP = model.get_dim("nP")
+    nO = model.get_dim("nO")
+    # Backprop the "padding", used as a filler for missing values.
+    # Values that are missing are set to -1, and each state vector could
+    # have multiple missing values. The padding has different values for
+    # different missing features. The gradient of the padding vector is:
+    #
+    # for b in range(nB):
+    #     for f in range(nF):
+    #         if ids[b, f] < 0:
+    #             d_pad[f] += dY[b]
+    #
+    # Which can be rewritten as:
+    #
+    # (ids < 0).T @ dY
+    mask = model.ops.asarray(ids < 0, dtype="f")
+    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
+    return d_pad.reshape((1, nF, nO, nP))
+
+
+def init(model, X=None, Y=None):
+    """This is like the 'layer sequential unit variance', but instead
+    of taking the actual inputs, we randomly generate whitened data.
+
+    Why's this all so complicated? We have a huge number of inputs,
+    and the maxout unit makes guessing the dynamics tricky. Instead
+    we set the maxout weights to values that empirically result in
+    whitened outputs given whitened inputs.
+    """
+    if model.has_param("W") and model.get_param("W").any():
+        return
+
+    nF = model.get_dim("nF")
+    nO = model.get_dim("nO")
+    nP = model.get_dim("nP")
+    nI = model.get_dim("nI")
+    W = model.ops.alloc4f(nF, nO, nP, nI)
+    b = model.ops.alloc2f(nO, nP)
+    pad = model.ops.alloc4f(1, nF, nO, nP)
+
+    ops = model.ops
+    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
+    pad = normal_init(ops, pad.shape, mean=1.0)
+    model.set_param("W", W)
+    model.set_param("b", b)
+    model.set_param("pad", pad)
+
+    ids = ops.alloc((5000, nF), dtype="f")
+    ids += ops.xp.random.uniform(0, 1000, ids.shape)
+    ids = ops.asarray(ids, dtype="i")
+    tokvecs = ops.alloc((5000, nI), dtype="f")
+    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
+        tokvecs.shape
+    )
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
+        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors = vectors.reshape((vectors.shape[0], nO, nP))
+        vectors += b
+        vectors = model.ops.asarray(vectors)
+        if nP >= 2:
+            return model.ops.maxout(vectors)[0]
+        else:
+            return vectors * (vectors >= 0)
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = model.get_param("W").copy()
+    b = model.get_param("b").copy()
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("b", b)
+        else:
+            break
diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd
deleted file mode 100644
index 965508519e8..00000000000
--- a/spacy/ml/tb_framework.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from libc.stdint cimport int8_t
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-    int tokens
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const int8_t* seen_mask
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
new file mode 100644
index 00000000000..ab4a969e24e
--- /dev/null
+++ b/spacy/ml/tb_framework.py
@@ -0,0 +1,50 @@
+from thinc.api import Model, noop
+from .parser_model import ParserStepModel
+from ..util import registry
+
+
+@registry.layers("spacy.TransitionModel.v1")
+def TransitionModel(
+    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
+):
+    """Set up a stepwise transition-based model"""
+    if upper is None:
+        has_upper = False
+        upper = noop()
+    else:
+        has_upper = True
+    # don't define nO for this object, because we can't dynamically change it
+    return Model(
+        name="parser_model",
+        forward=forward,
+        dims={"nI": tok2vec.maybe_get_dim("nI")},
+        layers=[tok2vec, lower, upper],
+        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
+        init=init,
+        attrs={
+            "has_upper": has_upper,
+            "unseen_classes": set(unseen_classes),
+            "resize_output": resize_output,
+        },
+    )
+
+
+def forward(model, X, is_train):
+    step_model = ParserStepModel(
+        X,
+        model.layers,
+        unseen_classes=model.attrs["unseen_classes"],
+        train=is_train,
+        has_upper=model.attrs["has_upper"],
+    )
+
+    return step_model, step_model.finish_steps
+
+
+def init(model, X=None, Y=None):
+    model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower")
+    lower.initialize()
+    if model.attrs["has_upper"]:
+        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
+        model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
deleted file mode 100644
index b81553323e4..00000000000
--- a/spacy/ml/tb_framework.pyx
+++ /dev/null
@@ -1,639 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-from typing import Any, List, Optional, Tuple, cast
-
-from libc.stdlib cimport calloc, free, realloc
-from libc.string cimport memcpy, memset
-from libcpp.vector cimport vector
-
-import numpy
-
-cimport numpy as np
-
-from thinc.api import (
-    Linear,
-    Model,
-    NumpyOps,
-    chain,
-    glorot_uniform_init,
-    list2array,
-    normal_init,
-    uniform_init,
-    zero_init,
-)
-
-from thinc.backends.cblas cimport CBlas, saxpy, sgemm
-
-from thinc.types import Floats2d, Floats3d, Floats4d, Ints1d, Ints2d
-
-from ..errors import Errors
-from ..pipeline._parser_internals import _beam_utils
-from ..pipeline._parser_internals.batch import GreedyBatch
-
-from ..pipeline._parser_internals._parser_utils cimport arg_max
-from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
-from ..pipeline._parser_internals.transition_system cimport (
-    TransitionSystem,
-    c_apply_actions,
-    c_transition_batch,
-)
-
-from ..tokens.doc import Doc
-from ..util import registry
-
-State = Any  # TODO
-
-
-@registry.layers("spacy.TransitionModel.v2")
-def TransitionModel(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    beam_width: int = 1,
-    beam_density: float = 0.0,
-    state_tokens: int,
-    hidden_width: int,
-    maxout_pieces: int,
-    nO: Optional[int] = None,
-    unseen_classes=set(),
-) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
-    """Set up a transition-based parsing model, using a maxout hidden
-    layer and a linear output layer.
-    """
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
-    tok2vec_projected.set_dim("nO", hidden_width)
-
-    # FIXME: we use `output` as a container for the output layer's
-    # weights and biases. Thinc optimizers cannot handle resizing
-    # of parameters. So, when the parser model is resized, we
-    # construct a new `output` layer, which has a different key in
-    # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `output` layer by `output_W` and `output_b`
-    # parameters in this model.
-    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
-
-    return Model(
-        name="parser_model",
-        forward=forward,
-        init=init,
-        layers=[tok2vec_projected, output],
-        refs={
-            "tok2vec": tok2vec_projected,
-            "output": output,
-        },
-        params={
-            "hidden_W": None,  # Floats2d W for the hidden layer
-            "hidden_b": None,  # Floats1d bias for the hidden layer
-            "hidden_pad": None,  # Floats1d padding for the hidden layer
-        },
-        dims={
-            "nO": None,  # Output size
-            "nP": maxout_pieces,
-            "nH": hidden_width,
-            "nI": tok2vec_projected.maybe_get_dim("nO"),
-            "nF": state_tokens,
-        },
-        attrs={
-            "beam_width": beam_width,
-            "beam_density": beam_density,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def resize_output(model: Model, new_nO: int) -> Model:
-    old_nO = model.maybe_get_dim("nO")
-    output = model.get_ref("output")
-    if old_nO is None:
-        model.set_dim("nO", new_nO)
-        output.set_dim("nO", new_nO)
-        output.initialize()
-        return model
-    elif new_nO <= old_nO:
-        return model
-    elif output.has_param("W"):
-        nH = model.get_dim("nH")
-        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_output.initialize()
-        new_W = new_output.get_param("W")
-        new_b = new_output.get_param("b")
-        old_W = output.get_param("W")
-        old_b = output.get_param("b")
-        new_W[:old_nO] = old_W  # type: ignore
-        new_b[:old_nO] = old_b  # type: ignore
-        for i in range(old_nO, new_nO):
-            model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_output
-        model.set_ref("output", new_output)
-    # TODO: Avoid this private intrusion
-    model._dims["nO"] = new_nO
-    return model
-
-
-def init(
-    model,
-    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
-    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
-):
-    if X is not None:
-        docs, _ = X
-        model.get_ref("tok2vec").initialize(X=docs)
-    else:
-        model.get_ref("tok2vec").initialize()
-    inferred_nO = _infer_nO(Y)
-    if inferred_nO is not None:
-        current_nO = model.maybe_get_dim("nO")
-        if current_nO is None or current_nO != inferred_nO:
-            model.attrs["resize_output"](model, inferred_nO)
-    # nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nH = model.get_dim("nH")
-    nI = model.get_dim("nI")
-    nF = model.get_dim("nF")
-    ops = model.ops
-
-    Wl = ops.alloc2f(nH * nP, nF * nI)
-    bl = ops.alloc1f(nH * nP)
-    padl = ops.alloc1f(nI)
-    # Wl = zero_init(ops, Wl.shape)
-    Wl = glorot_uniform_init(ops, Wl.shape)
-    padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize output_W
-    model.set_param("hidden_W", Wl)
-    model.set_param("hidden_b", bl)
-    model.set_param("hidden_pad", padl)
-    # model = _lsuv_init(model)
-    return model
-
-
-class TransitionModelInputs:
-    """
-    Input to transition model.
-    """
-
-    # dataclass annotation is not yet supported in Cython 0.29.x,
-    # so, we'll do something close to it.
-
-    actions: Optional[List[Ints1d]]
-    docs: List[Doc]
-    max_moves: int
-    moves: TransitionSystem
-    states: Optional[List[State]]
-
-    __slots__ = [
-        "actions",
-        "docs",
-        "max_moves",
-        "moves",
-        "states",
-    ]
-
-    def __init__(
-        self,
-        docs: List[Doc],
-        moves: TransitionSystem,
-        actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0,
-        states: Optional[List[State]] = None,
-    ):
-        """
-        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
-        docs (List[Doc]): Docs to predict transition sequences for.
-        max_moves: (int): the maximum number of moves to apply, values less
-            than 1 will apply moves to states until they are final states.
-        moves (TransitionSystem): the transition system to use when predicting
-            the transition sequences.
-        states (Optional[List[States]]): the initial states to predict the
-            transition sequences for. When absent, the initial states are
-            initialized from the provided Docs.
-        """
-        self.actions = actions
-        self.docs = docs
-        self.moves = moves
-        self.max_moves = max_moves
-        self.states = states
-
-
-def forward(model, inputs: TransitionModelInputs, is_train: bool):
-    docs = inputs.docs
-    moves = inputs.moves
-    actions = inputs.actions
-
-    beam_width = model.attrs["beam_width"]
-    hidden_pad = model.get_param("hidden_pad")
-    tok2vec = model.get_ref("tok2vec")
-
-    states = moves.init_batch(docs) if inputs.states is None else inputs.states
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    seen_mask = _get_seen_mask(model)
-
-    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
-        # Note: max_moves is only used during training, so we don't need to
-        #       pass it to the greedy inference path.
-        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
-    else:
-        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
-                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
-                                 max_moves=inputs.max_moves)
-
-
-def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
-                        np.ndarray[np.npy_bool, ndim = 1] seen_mask, actions: Optional[List[Ints1d]] = None):
-    cdef vector[StateC*] c_states
-    cdef StateClass state
-    for state in states:
-        if not state.is_final():
-            c_states.push_back(state.c)
-    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
-    # Precomputed features have rows for each token, plus one for padding.
-    cdef int n_tokens = feats.shape[0] - 1
-    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
-    cdef CBlas cblas = model.ops.cblas()
-    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
-
-    def backprop(dY):
-        raise ValueError(Errors.E4004)
-
-    return (states, scores), backprop
-
-
-cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
-                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
-    cdef int i
-    cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = _alloc_activations(sizes)
-    cdef np.ndarray step_scores
-    cdef np.ndarray step_actions
-
-    scores = []
-    while sizes.states >= 1:
-        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
-        step_actions = actions[0] if actions is not None else None
-        with nogil:
-            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
-            if actions is None:
-                # Validate actions, argmax, take action.
-                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
-                                   sizes.states)
-            else:
-                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-        sizes.states = unfinished.size()
-        scores.append(step_scores)
-        unfinished.clear()
-        actions = actions[1:] if actions is not None else None
-    _free_activations(&activations)
-
-    return scores
-
-
-def _forward_fallback(
-    model: Model,
-    moves: TransitionSystem,
-    states: List[StateClass],
-    tokvecs, backprop_tok2vec,
-    feats,
-    backprop_feats,
-    seen_mask,
-    is_train: bool,
-    actions: Optional[List[Ints1d]] = None,
-        max_moves: int = 0):
-    nF = model.get_dim("nF")
-    output = model.get_ref("output")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-
-    beam_width = model.attrs["beam_width"]
-    beam_density = model.attrs["beam_density"]
-
-    ops = model.ops
-
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    if beam_width == 1:
-        batch = GreedyBatch(moves, states, None)
-    else:
-        batch = _beam_utils.BeamBatch(
-            moves, states, None, width=beam_width, density=beam_density
-        )
-    arange = ops.xp.arange(nF)
-    n_moves = 0
-    while not batch.is_done:
-        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
-        for i, state in enumerate(batch.get_unfinished_states()):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += hidden_b
-        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        cpu_scores = ops.to_numpy(scores)
-        if actions is None:
-            batch.advance(cpu_scores)
-        else:
-            batch.advance_with_actions(actions[0])
-            actions = actions[1:]
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_ids.append(ids)
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-        if n_moves >= max_moves >= 1:
-            break
-        n_moves += 1
-
-    def backprop_parser(d_states_d_scores):
-        ids = ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = ops.xp.vstack(all_statevecs)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False  # no-cython-lint
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO) @ (nS, nH).T
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        # We don't need to backprop the summation, because we pass back the IDs instead
-        d_state_features = backprop_feats((d_preacts2f, ids))
-        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (list(batch), all_scores), backprop_parser
-
-
-def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
-    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
-    for class_ in model.attrs.get("unseen_classes", set()):
-        mask[class_] = True
-    return mask
-
-
-def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("hidden_W")
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
-    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
-    W3f = W3f.transpose((1, 0, 2))
-    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
-    assert X.shape == (X.shape[0], nI), X.shape
-    Yf_ = model.ops.gemm(X, W2f, trans2=True)
-    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
-
-    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nH, nP), and get back:
-        # (nB, nH, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nH, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        dXf = model.ops.gemm(dY, W)
-        Xf = X[ids].reshape((ids.shape[0], -1))
-        dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("hidden_W", dW)
-        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
-
-    return Yf, backward
-
-
-def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
-    if Y is None:
-        return None
-    _, scores = Y
-    if len(scores) == 0:
-        return None
-    assert scores[0].shape[0] >= 1
-    assert len(scores[0].shape) == 2
-    return scores[0].shape[1]
-
-
-def _lsuv_init(model: Model):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    W = model.maybe_get_param("hidden_W")
-    if W is not None and W.any():
-        return
-
-    nF = model.get_dim("nF")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nH, nP, nI)
-    b = model.ops.alloc2f(nH, nP)
-    pad = model.ops.alloc4f(1, nF, nH, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc_f((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc_f((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
-        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
-        vectors3f += b
-        return model.ops.maxout(vectors3f)[0]
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = cast(Floats4d, model.get_param("hidden_W").copy())
-    b = cast(Floats2d, model.get_param("hidden_b").copy())
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("hidden_W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("hidden_b", b)
-        else:
-            break
-    return model
-
-
-cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    output = model.get_ref("output")
-    cdef np.ndarray hidden_b = model.get_param("hidden_b")
-    cdef np.ndarray output_W = output.get_param("W")
-    cdef np.ndarray output_b = output.get_param("b")
-
-    cdef WeightsC weights
-    weights.feat_weights = feats
-    weights.feat_bias = <const float*>hidden_b.data
-    weights.hidden_weights = <const float *> output_W.data
-    weights.hidden_bias = <const float *> output_b.data
-    weights.seen_mask = <const int8_t*> seen_mask.data
-
-    return weights
-
-
-cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC sizes
-    sizes.states = batch_size
-    sizes.classes = model.get_dim("nO")
-    sizes.hiddens = model.get_dim("nH")
-    sizes.pieces = model.get_dim("nP")
-    sizes.feats = model.get_dim("nF")
-    sizes.embed_width = model.get_dim("nI")
-    sizes.tokens = tokens
-    return sizes
-
-
-cdef ActivationsC _alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    _resize_activations(&A, n)
-    return A
-
-
-cdef void _free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-                                    n.states * n.feats * sizeof(A.token_ids[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-                                   n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    _resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    if W.hidden_weights == NULL:
-        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = scores[0]
-    for i in range(1, n.states * n.classes):
-        if scores[i] < min_:
-            min_ = scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if W.seen_mask[j]:
-                scores[i*n.classes+j] = min_
-
-
-cdef void _sum_state_features(CBlas cblas, float* output, const float* cached,
-                              const int* token_ids, SizesC n) nogil:
-    cdef int idx, b, f
-    cdef const float* feature
-    cdef int B = n.states
-    cdef int O = n.hiddens * n.pieces  # no-cython-lint
-    cdef int F = n.feats
-    cdef int T = n.tokens
-    padding = cached + (T * F * O)
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd
deleted file mode 100644
index 7fee05bad60..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef int arg_max(const float* scores, const int n_classes) nogil
-cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx
deleted file mode 100644
index 582756bf5be..00000000000
--- a/spacy/pipeline/_parser_internals/_parser_utils.pyx
+++ /dev/null
@@ -1,22 +0,0 @@
-# cython: infer_types=True
-
-cdef inline int arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
-
-
-cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 1c61ac271d8..04274ce8af1 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -7,6 +7,8 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
+from libcpp.set cimport set
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
@@ -26,7 +28,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    vector[int] _heads
+    int* _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,34 +36,31 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
-    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil except +:
-        this._heads.resize(length, -1)
-        this._unshiftable.resize(length, False)
-
-        # Reserve memory ahead of time to minimize allocations during parsing.
-        # The initial capacity set here ideally reflects the expected average-case/majority usage.
-        cdef int init_capacity = 32
-        this._stack.reserve(init_capacity)
-        this._rebuffer.reserve(init_capacity)
-        this._ents.reserve(init_capacity)
-        this._left_arcs.reserve(init_capacity)
-        this._right_arcs.reserve(init_capacity)
-        this.history.reserve(init_capacity)
-
+    __init__(const TokenC* sent, int length) nogil:
         this._sent = sent
+        this._heads = <int*>calloc(length, sizeof(int))
+        if not (this._sent and this._heads):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()
         this.offset = 0
         this.length = length
         this._b_i = 0
+        for i in range(length):
+            this._heads[i] = -1
+            this._unshiftable.push_back(0)
         memset(&this._empty_token, 0, sizeof(TokenC))
         this._empty_token.lex = &EMPTY_LEXEME
 
+    __dealloc__():
+        free(this._heads)
+
     void set_context_tokens(int* ids, int n) nogil:
         cdef int i, j
         if n == 1:
@@ -134,20 +133,19 @@ cdef cppclass StateC:
                 ids[i] = -1
 
     int S(int i) nogil const:
-        cdef int stack_size = this._stack.size()
-        if i >= stack_size or i < 0:
+        if i >= this._stack.size():
             return -1
-        else:
-            return this._stack[stack_size - (i+1)]
+        elif i < 0:
+            return -1
+        return this._stack.at(this._stack.size() - (i+1))
 
     int B(int i) nogil const:
-        cdef int buf_size = this._rebuffer.size()
         if i < 0:
             return -1
-        elif i < buf_size:
-            return this._rebuffer[buf_size - (i+1)]
+        elif i < this._rebuffer.size():
+            return this._rebuffer.at(this._rebuffer.size() - (i+1))
         else:
-            b_i = this._b_i + (i - buf_size)
+            b_i = this._b_i + (i - this._rebuffer.size())
             if b_i >= this.length:
                 return -1
             else:
@@ -246,7 +244,7 @@ cdef cppclass StateC:
             return 0
         elif this._sent[word].sent_start == 1:
             return 1
-        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
+        elif this._sent_starts.count(word) >= 1:
             return 1
         else:
             return 0
@@ -330,7 +328,7 @@ cdef cppclass StateC:
         if item >= this._unshiftable.size():
             return 0
         else:
-            return this._unshiftable[item]
+            return this._unshiftable.at(item)
 
     void set_reshiftable(int item) nogil:
         if item < this._unshiftable.size():
@@ -350,9 +348,6 @@ cdef cppclass StateC:
         this._heads[child] = head
 
     void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
-        cdef vector[ArcC]* arcs
-        cdef ArcC* arc
-
         arcs_it = heads_arcs.find(h_i)
         if arcs_it == heads_arcs.end():
             return
@@ -361,12 +356,12 @@ cdef cppclass StateC:
         if arcs.size() == 0:
             return
 
-        arc = &arcs.back()
+        arc = arcs.back()
         if arc.head == h_i and arc.child == c_i:
             arcs.pop_back()
         else:
             for i in range(arcs.size()-1):
-                arc = &deref(arcs)[i]
+                arc = arcs.at(i)
                 if arc.head == h_i and arc.child == c_i:
                     arc.head = -1
                     arc.child = -1
@@ -406,11 +401,10 @@ cdef cppclass StateC:
         this._rebuffer = src._rebuffer
         this._sent_starts = src._sent_starts
         this._unshiftable = src._unshiftable
-        this._heads = src._heads
+        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
         this._ents = src._ents
         this._left_arcs = src._left_arcs
         this._right_arcs = src._right_arcs
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
-        this.history = src.history
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 462aa820e4f..9dda3bd5e44 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -779,8 +779,6 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -865,7 +863,6 @@ cdef class ArcEager(TransitionSystem):
                             state.print_state()
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 failed = False
diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd
deleted file mode 100644
index 60734e549aa..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef class Batch:
-    pass
diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx
deleted file mode 100644
index 91073b52e68..00000000000
--- a/spacy/pipeline/_parser_internals/batch.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-from typing import Any
-
-TransitionSystem = Any  # TODO
-
-cdef class Batch:
-    def advance(self, scores):
-        raise NotImplementedError
-
-    def get_states(self):
-        raise NotImplementedError
-
-    @property
-    def is_done(self):
-        raise NotImplementedError
-
-    def get_unfinished_states(self):
-        raise NotImplementedError
-
-    def __getitem__(self, i):
-        raise NotImplementedError
-
-    def __len__(self):
-        raise NotImplementedError
-
-
-class GreedyBatch(Batch):
-    def __init__(self, moves: TransitionSystem, states, golds):
-        self._moves = moves
-        self._states = states
-        self._next_states = [s for s in states if not s.is_final()]
-
-    def advance(self, scores):
-        self._next_states = self._moves.transition_states(self._next_states, scores)
-
-    def advance_with_actions(self, actions):
-        self._next_states = self._moves.apply_actions(self._next_states, actions)
-
-    def get_states(self):
-        return self._states
-
-    @property
-    def is_done(self):
-        return all(s.is_final() for s in self._states)
-
-    def get_unfinished_states(self):
-        return [st for st in self._states if not st.is_final()]
-
-    def __getitem__(self, i):
-        return self._states[i]
-
-    def __len__(self):
-        return len(self._states)
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index bd4e06dedb3..0a79e77cb86 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -166,7 +166,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+    
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -316,8 +316,6 @@ cdef class BiluoPushDown(TransitionSystem):
             for span in eg.y.spans.get(neg_key, []):
                 if span.start >= start and span.end <= end:
                     return True
-        if end is not None and end < 0:
-            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -653,7 +651,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
-
+ 
 
 
 cdef class Out:
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index c2a0d22956a..f25408a13ba 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -21,10 +21,6 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
-    @property
-    def history(self):
-        return list(self.c.history)
-
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -181,6 +177,3 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
-
-    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
-        self.c.set_context_tokens(&output[row, 0], n_feats)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index 08baed932ba..04cd10d8864 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -57,10 +57,3 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index ae1cf890f3e..4a0feb435dd 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -3,8 +3,6 @@
 from __future__ import print_function
 
 from cymem.cymem cimport Pool
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
 
 from collections import Counter
 
@@ -76,18 +74,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def follow_history(self, doc, history):
-        cdef int clas
-        cdef StateClass state = StateClass(doc)
-        for clas in history:
-            action = self.c[clas]
-            action.do(state.c, action.label)
-            state.c.history.push_back(clas)
-        return state
-
     def get_oracle_sequence(self, Example example, _debug=False):
-        if not self.has_gold(example):
-            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -99,8 +86,6 @@ cdef class TransitionSystem:
             return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
-        if state.is_final():
-            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -126,7 +111,6 @@ cdef class TransitionSystem:
                             "S0 head?", str(state.has_head(state.S(0))),
                         )))
                     action.do(state.c, action.label)
-                    state.c.history.push_back(i)
                     break
             else:
                 if _debug:
@@ -154,28 +138,6 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
-        state.c.history.push_back(action.clas)
-
-    def apply_actions(self, states, const int[::1] actions):
-        assert len(states) == actions.shape[0]
-        cdef StateClass state
-        cdef vector[StateC*] c_states
-        c_states.resize(len(states))
-        cdef int i
-        for (i, state) in enumerate(states):
-            c_states[i] = state.c
-        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    def transition_states(self, states, float[:, ::1] scores):
-        assert len(states) == scores.shape[0]
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -288,34 +250,3 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
-
-
-cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
-                          int batch_size) nogil:
-    cdef int i
-    cdef Transition action
-    cdef StateC* state
-    for i in range(batch_size):
-        state = states[i]
-        action = moves.c[actions[i]]
-        action.do(state, action.label)
-        state.history.push_back(action.clas)
-
-
-cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
-                             int nr_class, int batch_size) nogil:
-    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
-    cdef int i, guess
-    cdef Transition action
-    for i in range(batch_size):
-        moves.set_valid(is_valid, states[i])
-        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-        if guess == -1:
-            # This shouldn't happen, but it's hard to raise an error here,
-            # and we don't want to infinite loop. So, force to end state.
-            states[i].force_final()
-        else:
-            action = moves.c[guess]
-            action.do(states[i], action.label)
-            states[i].history.push_back(guess)
-    free(is_valid)
diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.pyx
similarity index 96%
rename from spacy/pipeline/dep_parser.py
rename to spacy/pipeline/dep_parser.pyx
index b4961487b83..3e59deaae4f 100644
--- a/spacy/pipeline/dep_parser.py
+++ b/spacy/pipeline/dep_parser.pyx
@@ -4,6 +4,10 @@
 
 from thinc.api import Config, Model
 
+from ._parser_internals.transition_system import TransitionSystem
+from .transition_parser cimport Parser
+from ._parser_internals.arc_eager cimport ArcEager
+
 from ..language import Language
 from ..scorer import Scorer
 from ..training import remove_bilu_prefix
@@ -17,11 +21,12 @@
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -121,7 +126,6 @@ def make_parser(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,7 +231,6 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
-
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,11 +238,8 @@ def dep_getter(token, attr):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
-
     results = {}
-    results.update(
-        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-    )
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -252,12 +252,11 @@ def make_parser_scorer():
     return parser_score
 
 
-class DependencyParser(Parser):
+cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
-
     TransitionSystem = ArcEager
 
     def __init__(
@@ -277,7 +276,8 @@ def __init__(
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser."""
+        """Create a DependencyParser.
+        """
         super().__init__(
             vocab,
             model,
diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.pyx
similarity index 93%
rename from spacy/pipeline/ner.py
rename to spacy/pipeline/ner.pyx
index 1c7cf151385..c73ec5c528a 100644
--- a/spacy/pipeline/ner.py
+++ b/spacy/pipeline/ner.pyx
@@ -10,15 +10,22 @@
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser import Parser
+from .transition_parser cimport Parser
+from ._parser_internals.ner cimport BiluoPushDown
+from ..language import Language
+from ..scorer import get_ner_prf, PRFScore
+from ..util import registry
+from ..training import remove_bilu_prefix
+
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
+use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -43,12 +50,8 @@
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+
 )
 def make_ner(
     nlp: Language,
@@ -101,7 +104,6 @@ def make_ner(
         scorer=scorer,
     )
 
-
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -115,12 +117,7 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={
-        "ents_f": 1.0,
-        "ents_p": 0.0,
-        "ents_r": 0.0,
-        "ents_per_type": None,
-    },
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
 def make_beam_ner(
     nlp: Language,
@@ -194,12 +191,11 @@ def make_ner_scorer():
     return ner_score
 
 
-class EntityRecognizer(Parser):
+cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
-
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -217,14 +213,15 @@ def __init__(
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer."""
+        """Create an EntityRecognizer.
+        """
         super().__init__(
             vocab,
             model,
             name,
             moves,
             update_with_oracle_cut_size=update_with_oracle_cut_size,
-            min_action_freq=1,  # not relevant for NER
+            min_action_freq=1,   # not relevant for NER
             learn_tokens=False,  # not relevant for NER
             beam_width=beam_width,
             beam_density=beam_density,
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index bb9b7653ce3..5d5fbb02790 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,6 +17,7 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
+import logging
 
 from ..util import make_tempdir
 
@@ -413,7 +414,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -540,11 +541,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
-def test_overfitting_IO():
-    fix_random_seed(1)
+@pytest.mark.parametrize("use_upper", [True, False])
+def test_overfitting_IO(use_upper):
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {}})
+    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -576,6 +577,7 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
         # Ensure that the predictions are still the same, even after adding a new label
         ner2 = nlp2.get_pipe("ner")
+        assert ner2.model.attrs["has_upper"] == use_upper
         ner2.add_label("RANDOM_NEW_LABEL")
         doc3 = nlp2(test_text)
         ents3 = doc3.ents
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index fe82ad2fde0..f63d56f6922 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,6 +1,3 @@
-import itertools
-
-import numpy
 import pytest
 from numpy.testing import assert_equal
 from thinc.api import Adam, fix_random_seed
@@ -62,8 +59,6 @@
     ),
 ]
 
-PARSERS = ["parser"]  # TODO: Test beam_parser when ready
-
 eps = 0.1
 
 
@@ -176,57 +171,6 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
     assert doc[0].dep != 0
 
 
-def test_parser_apply_actions(en_vocab, en_parser):
-    words = ["I", "ate", "pizza"]
-    words2 = ["Eat", "more", "pizza", "!"]
-    doc1 = Doc(en_vocab, words=words)
-    doc2 = Doc(en_vocab, words=words2)
-    docs = [doc1, doc2]
-
-    moves = en_parser.moves
-    moves.add_action(0, "")
-    moves.add_action(1, "")
-    moves.add_action(2, "nsubj")
-    moves.add_action(3, "obj")
-    moves.add_action(2, "amod")
-
-    actions = [
-        numpy.array([0, 0], dtype="i"),
-        numpy.array([2, 0], dtype="i"),
-        numpy.array([0, 4], dtype="i"),
-        numpy.array([3, 3], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([1, 1], dtype="i"),
-        numpy.array([0], dtype="i"),
-        numpy.array([1], dtype="i"),
-    ]
-
-    states = moves.init_batch(docs)
-    active_states = states
-
-    for step_actions in actions:
-        active_states = moves.apply_actions(active_states, step_actions)
-
-    assert len(active_states) == 0
-
-    for (state, doc) in zip(states, docs):
-        moves.set_annotations(state, doc)
-
-    assert docs[0][0].head.i == 1
-    assert docs[0][0].dep_ == "nsubj"
-    assert docs[0][1].head.i == 1
-    assert docs[0][1].dep_ == "ROOT"
-    assert docs[0][2].head.i == 1
-    assert docs[0][2].dep_ == "obj"
-
-    assert docs[1][0].head.i == 0
-    assert docs[1][0].dep_ == "ROOT"
-    assert docs[1][1].head.i == 2
-    assert docs[1][1].dep_ == "amod"
-    assert docs[1][2].head.i == 0
-    assert docs[1][2].dep_ == "obj"
-
-
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
@@ -375,7 +319,7 @@ def test_parser_constructor(en_vocab):
     DependencyParser(en_vocab, model)
 
 
-@pytest.mark.parametrize("pipe_name", PARSERS)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
     nlp = English()
@@ -401,15 +345,11 @@ def test_incomplete_data(pipe_name):
     assert doc[2].head.i == 1
 
 
-@pytest.mark.parametrize(
-    "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
-)
-def test_overfitting_IO(pipe_name, max_moves):
-    fix_random_seed(0)
+@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
+def test_overfitting_IO(pipe_name):
     # Simple test to try and quickly overfit the dependency parser (normal or beam)
     nlp = English()
     parser = nlp.add_pipe(pipe_name)
-    parser.cfg["update_with_oracle_cut_size"] = max_moves
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -515,12 +455,10 @@ def test_distill(max_moves):
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TODO: re-enable after we have a spacy-legacy release for v4. See
-        # https://github.com/explosion/spacy-legacy/pull/36
-        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V1
+        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        # TransitionBasedParser V2
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
-        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f6cefbc1f84..c3c4bb6c686 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -384,7 +384,7 @@ def test_replace_listeners():
     factory = "ner"
 
     [components.ner.model]
-    @architectures = "spacy.TransitionBasedParser.v3"
+    @architectures = "spacy.TransitionBasedParser.v2"
 
     [components.ner.model.tok2vec]
     @architectures = "spacy.Tok2VecListener.v1"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index b351ea80121..8a1c74ca9ed 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -189,11 +189,33 @@
 
 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v3"
+@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
+use_upper = true
+
+[model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 333
+depth = 4
+embed_size = 5555
+window_size = 1
+maxout_pieces = 7
+subword_features = false
+"""
+
+
+parser_config_string_no_upper = """
+[model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 66
+maxout_pieces = 2
+use_upper = false
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -224,6 +246,7 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
+        use_upper=True,
     )
     return parser
 
@@ -337,16 +360,15 @@ def test_serialize_custom_nlp():
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("W")
-        assert output.has_param("b")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        assert model.get_ref("upper").get_dim("nI") == 65
+        assert model.get_ref("lower").get_dim("nI") == 65
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()
@@ -359,13 +381,11 @@ def test_serialize_parser(parser_config_string):
         nlp.to_disk(d)
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
-        assert model.get_ref("tok2vec") is not None
-        assert model.has_param("hidden_W")
-        assert model.has_param("hidden_b")
-        output = model.get_ref("output")
-        assert output is not None
-        assert output.has_param("b")
-        assert output.has_param("W")
+        model.get_ref("tok2vec")
+        # check that we have the correct settings, not the default ones
+        if model.attrs["has_upper"]:
+            assert model.get_ref("upper").get_dim("nI") == 66
+        assert model.get_ref("lower").get_dim("nI") == 66
 
 
 def test_config_nlp_roundtrip():
@@ -561,7 +581,9 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)
 
 
-@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
+@pytest.mark.parametrize(
+    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
+)
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c05ef625e11..44717f6eba2 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,19 +1,22 @@
 import ctypes
 import os
 from pathlib import Path
-
 import pytest
-from pydantic import ValidationError
-from thinc.api import (
-    Config,
-    ConfigValidationError,
-    CupyOps,
-    MPSOps,
-    NumpyOps,
-    Optimizer,
-    get_current_ops,
-    set_current_ops,
-)
+
+try:
+    from pydantic.v1 import ValidationError
+except ImportError:
+    from pydantic import ValidationError  # type: ignore
+
+from spacy.about import __version__ as spacy_version
+from spacy import util
+from spacy import prefer_gpu, require_gpu, require_cpu
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int, find_available_port
+from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util
@@ -96,6 +99,34 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
+    assert model.get_param("W").shape == (nF, nO, nP, nI)
+    tensor = model.ops.alloc((10, nI))
+    Y, get_dX = model.begin_update(tensor)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
+    dY = model.ops.alloc((15, nO, nP))
+    ids = model.ops.alloc((15, nF))
+    ids[1, 2] = -1
+    dY[1] = 1
+    assert not model.has_grad("pad")
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 2, 0, 0] == 1.0
+    ids.fill(0.0)
+    dY.fill(0.0)
+    dY[0] = 0
+    ids[1, 2] = 0
+    ids[1, 1] = -1
+    ids[1, 0] = -1
+    dY[1] = 1
+    ids[2, 0] = -1
+    dY[2] = 5
+    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
+    assert d_pad[0, 0, 0, 0] == 6
+    assert d_pad[0, 1, 0, 0] == 1
+    assert d_pad[0, 2, 0, 0] == 0
+
+
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index db8f974ea19..956234ac0d4 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -833,17 +833,18 @@ for a Tok2Vec layer.
 
 ## Parser & NER architectures {id="parser"}
 
-### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v3"
+> @architectures = "spacy.TransitionBasedParser.v2"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
+> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"
@@ -873,22 +874,23 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                 | Description                                                                                                                                                       |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                        |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                               |
-| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
-| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                            |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~                                                             |
-| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                       |
-| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                     |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 <Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
 
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.
 
- </Accordion>
+</Accordion>
 
 ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 44c80622437..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -302,7 +302,7 @@ the others, but may not be as accurate, especially if texts are short.
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 
 Identical to
-[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
+[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
 except the `use_upper` was set to `True` by default.
 
 ## Layers {id="layers"}

From 1d0467b262aabe25ba72e8b780cabd776e16e5e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:24:09 +0100
Subject: [PATCH 499/504] isort

---
 spacy/ml/tb_framework.py                    |  3 ++-
 spacy/pipeline/_parser_internals/_state.pxd |  3 +--
 spacy/pipeline/dep_parser.pyx               |  3 ++-
 spacy/pipeline/ner.pyx                      |  9 +++++----
 spacy/tests/parser/test_ner.py              |  1 -
 spacy/tests/test_misc.py                    | 20 +++++++++++---------
 6 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e24e..e351ad4e570 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
 from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
 from ..util import registry
+from .parser_model import ParserStepModel
 
 
 @registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 04274ce8af1..c063cf97cd4 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -1,4 +1,5 @@
 cimport libcpp
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as incr
 from libc.stdint cimport uint32_t, uint64_t
@@ -7,8 +8,6 @@ from libc.string cimport memcpy, memset
 from libcpp.set cimport set
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...attrs cimport IS_SPACE
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 3e59deaae4f..1fdc55dab41 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser cimport Parser
 
 from ..language import Language
 from ..scorer import Scorer
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c73ec5c528a..29f22b0174d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -10,13 +10,14 @@ from ..training import remove_bilu_prefix, validate_examples
 from ..util import registry
 from ._parser_internals.ner import BiluoPushDown
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
+
 from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser cimport Parser
+
 from ..language import Language
-from ..scorer import get_ner_prf, PRFScore
-from ..util import registry
+from ..scorer import PRFScore, get_ner_prf
 from ..training import remove_bilu_prefix
-
+from ..util import registry
 
 default_model_config = """
 [model]
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 5d5fbb02790..b6848d380fe 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -17,7 +17,6 @@
 from spacy.tokens import Doc, Span
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.vocab import Vocab
-import logging
 
 from ..util import make_tempdir
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 44717f6eba2..d2a41ff0fed 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 from pathlib import Path
+
 import pytest
 
 try:
@@ -8,15 +9,16 @@
 except ImportError:
     from pydantic import ValidationError  # type: ignore
 
-from spacy.about import __version__ as spacy_version
-from spacy import util
-from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
-from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.api import (
+    Config,
+    ConfigValidationError,
+    CupyOps,
+    MPSOps,
+    NumpyOps,
+    Optimizer,
+    get_current_ops,
+    set_current_ops,
+)
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 
 from spacy import prefer_gpu, require_cpu, require_gpu, util

From 1b187853552d56a12e292ded79919c5dfa2e952e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Fri, 8 Dec 2023 20:38:01 +0100
Subject: [PATCH 500/504] Add distillation tests with max cut size

And fix endless loop when the max cut size is 0 or 1.
---
 spacy/tests/parser/test_ner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b6848d380fe..7c3a9d56249 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -624,7 +624,9 @@ def test_is_distillable():
     assert ner.is_distillable
 
 
-def test_distill():
+@pytest.mark.slow
+@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
+def test_distill(max_moves):
     teacher = English()
     teacher_ner = teacher.add_pipe("ner")
     train_examples = []
@@ -642,6 +644,7 @@ def test_distill():
 
     student = English()
     student_ner = student.add_pipe("ner")
+    student_ner.cfg["update_with_oracle_cut_size"] = max_moves
     student_ner.initialize(
         get_examples=lambda: train_examples, labels=teacher_ner.label_data
     )

From c464dc2af489767dae62b608bb5104cf8be9bc10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Mon, 18 Dec 2023 20:02:15 +0100
Subject: [PATCH 501/504] Fix Cython lints

---
 spacy/pipeline/_parser_internals/ner.pyx | 4 ++--
 spacy/pipeline/dep_parser.pyx            | 1 +
 spacy/pipeline/ner.pyx                   | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 0a79e77cb86..a4f7094520c 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -166,7 +166,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-    
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -651,7 +651,7 @@ cdef class Unit:
                 cost += 1
                 break
         return cost
- 
+
 
 
 cdef class Out:
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 1fdc55dab41..cbd7187ff0f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 29f22b0174d..fe54d33a17b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],

From b1673ee1c3de18edf936d466b6a457e6874f2db8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 21 Dec 2023 09:47:38 +0100
Subject: [PATCH 502/504] No need for `Literal` compat, since we only support
 >= 3.8

---
 spacy/compat.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 30459e2e495..1e63807a0e8 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -23,11 +23,6 @@
 except ImportError:
     cupy = None
 
-if sys.version_info[:2] >= (3, 8):  # Python 3.8+
-    from typing import Literal, Protocol, runtime_checkable
-else:
-    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
-
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle

From 189c7c9cf14c2ada4d8aed22297aa63d0751b285 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 16 Jan 2024 14:54:26 +0100
Subject: [PATCH 503/504] Update thinc dependency to 9.0.0.dev4

---
 pyproject.toml                   | 2 +-
 requirements.txt                 | 2 +-
 spacy/pipeline/morphologizer.pyx | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3891d137867..0a5bc162773 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=9.0.0.dev4,<9.1.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 3a1ef6b70b4..0c852a0d01c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=9.0.0.dev4,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index f822c38ac0e..77c643d4630 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -302,7 +302,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []

From 16dba1895d60636c37f4550ebe59455ab1a045d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Tue, 2 Jan 2024 10:03:06 +0100
Subject: [PATCH 504/504] Add spacy.TextCatParametricAttention.v1 (#13201)

* Add spacy.TextCatParametricAttention.v1

This layer provides is a simplification of the ensemble classifier that
only uses paramteric attention. We have found empirically that with a
sufficient amount of training data, using the ensemble classifier with
BoW does not provide significant improvement in classifier accuracy.
However, plugging in a BoW classifier does reduce GPU training and
inference performance substantially, since it uses a GPU-only kernel.

* Fix merge fallout
---
 pyproject.toml   | 5 +++--
 requirements.txt | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0a5bc162773..bfd7e68d1f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev4,<9.1.0",
-    "numpy>=1.15.0",
+    "thinc>=8.2.2,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/requirements.txt b/requirements.txt
index 0c852a0d01c..94a9d17c0c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev4,<9.1.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0